def MinMaxScaler_ano_score():
    print("calculating anomaly scores for normal and anomalous samples")

    def create_score_data(model, images):
        pred_images = model.predict(images)
        # flatten each prediction into a feature vector
        score_data = pred_images.reshape((len(pred_images), -1))
        return score_data

    train_normal = create_score_data(metricLR_model, x_normal)
    test_normal = create_score_data(metricLR_model,
                                    x_test_normal)  # test normal
    test_ano = create_score_data(metricLR_model, x_ano)  # test anomaly
    print(train_normal.shape, test_normal.shape, test_ano.shape)

    # MinMaxScaler

    ms = MinMaxScaler()
    train_normal = ms.fit_transform(train_normal)
    clf = LocalOutlierFactor(n_neighbors=5)
    clf.fit(train_normal)

    test_normal = ms.transform(test_normal)
    test_ano = ms.transform(test_ano)
    # _decision_function is a private API of older scikit-learn releases;
    # negate it so that higher values mean more anomalous
    Z1 = -clf._decision_function(test_normal)
    Z2 = -clf._decision_function(test_ano)
    print('normal score {}, ano score {}'.format(sum(Z1), sum(Z2)))
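Note: _decision_function is a private API that newer scikit-learn releases removed; the supported replacement is to fit LocalOutlierFactor with novelty=True and call score_samples. A minimal sketch of the equivalent scoring, assuming the scaled arrays from the function above:

from sklearn.neighbors import LocalOutlierFactor

clf = LocalOutlierFactor(n_neighbors=5, novelty=True)
clf.fit(train_normal)                 # fit on normal samples only
Z1 = -clf.score_samples(test_normal)  # negate: higher = more anomalous
Z2 = -clf.score_samples(test_ano)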
Example No. 2
class LocalOutlierFactorScore(GraphScore):
    def __init__(self,
                 beta_matrix,
                 database_name,
                 window_size=None,
                 n_neighbors=40):
        self._split = (beta_matrix.shape[0]
                       if window_size is None else window_size)
        self._clf = LocalOutlierFactor(n_neighbors=n_neighbors,
                                       contamination='auto')
        super(LocalOutlierFactorScore, self).__init__(beta_matrix,
                                                      database_name)

    def _calc_score(self):
        num_graphs, num_ftr = self._beta_matrix.shape
        interval = self._split
        self._clf.fit(self._beta_matrix[:interval - 1])

        for graph_k in range(num_graphs):
            if graph_k >= interval:
                from_graph = graph_k - interval
                to_graph = graph_k
                self._clf.fit(self._beta_matrix[from_graph:to_graph])
            self._scores[graph_k] = self._clf._decision_function(
                [self._beta_matrix[graph_k]])[0]
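The class above scores each graph against a LOF model fitted on a sliding window of its predecessors. A self-contained sketch of the same sliding-window idea using only NumPy and scikit-learn's supported novelty mode (names are illustrative, not from the original; assumes window_size >= 2):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

def sliding_window_lof_scores(beta_matrix, window_size, n_neighbors=40):
    """Score each row against a LOF model fitted on the window of rows
    preceding it; the first rows are scored against the initial window."""
    n = len(beta_matrix)
    scores = np.zeros(n)
    for k in range(n):
        if k >= window_size:
            window = beta_matrix[k - window_size:k]
        else:
            window = beta_matrix[:window_size]
        clf = LocalOutlierFactor(n_neighbors=min(n_neighbors, len(window) - 1),
                                 novelty=True, contamination='auto')
        clf.fit(window)
        scores[k] = clf.decision_function(beta_matrix[k:k + 1])[0]
    return scores

# e.g. scores = sliding_window_lof_scores(np.random.randn(300, 8), window_size=60)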
Example No. 3
class LocalOutlierFactorAD(ADModel):
  def __init__(self):
    super().__init__()  
    self.clf = None
    self.scaler = None
    # Model Hyperparams
    thresholds = np.arange(-0.5, 0.5, 0.25)
    contamination = [0.05]
    nn = [20]
    self.params = [(c, n, t) for n in nn for c in contamination for t in thresholds]
    self.contamination = None
    self.n_neighbors = None
    self.threshold = None

  def config(self, hyperparam_tuple):
    contamination, nearest_n, thresh = hyperparam_tuple
    self.contamination = contamination
    self.n_neighbors = nearest_n
    self.threshold = thresh
  
  def train(self, X, y, verbose = False):
    # Scale features
    self.scaler = StandardScaler()
    self.scaler.fit(X)
    self.clf = LocalOutlierFactor(contamination = self.contamination, n_neighbors = self.n_neighbors)
    X_scaled = self.scaler.transform(X)
    self.clf.fit(X_scaled)

  def predict(self, X, **kwargs):
    # _decision_function is a private API of older scikit-learn releases
    preds = self.clf._decision_function(self.scaler.transform(X))
    preds = (preds < self.threshold).astype(np.int32).reshape(-1, 1)
    return preds
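A sketch of how the (contamination, n_neighbors, threshold) grid above might be swept. The ADModel base class and the evaluation data/metric are not shown in the original, so X_train, y_train, X_val, y_val and the use of f1_score are assumptions; like the class itself, it needs an older scikit-learn that still has _decision_function:

from sklearn.metrics import f1_score

model = LocalOutlierFactorAD()
best_f1, best_params = -1.0, None
for params in model.params:  # (contamination, n_neighbors, threshold)
    model.config(params)
    model.train(X_train, y_train)
    preds = model.predict(X_val)
    score = f1_score(y_val, preds.ravel())
    if score > best_f1:
        best_f1, best_params = score, params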
Example No. 4
    def perform_outlier_detection(self, X, len_priors):
        # LOF on all features
        clf = LocalOutlierFactor(n_neighbors=20)
        clf.fit(X)
        check_is_fitted(clf, ["threshold_", "negative_outlier_factor_", "n_neighbors_", "_distances_fit_X_"])
        if X is not None:
            X = check_array(X, accept_sparse='csr')
            y_pred = clf._decision_function(X)
        else:
            y_pred = clf.negative_outlier_factor_
        #lof_scores = y_pred[len_priors:]
        #lof_scores = zip(self.current_level_users, y_pred_new)
        lof_scores = y_pred

        # Isolation forest on all features
        clf = IsolationForest()
        clf.fit(X)
        y_pred = clf.decision_function(X)
        #forest_scores = y_pred[len_priors:]
        #forest_scores = zip(self.current_level_users, y_pred_new)
        forest_scores = y_pred

        scores = self.combine(lof_scores, forest_scores)
        new_scores = scores[len_priors:]
        user_scores = sorted(zip(self.current_level_users, new_scores), key=lambda x: x[1], reverse=True)
        threshold = np.percentile(new_scores, 95)
        outliers = [u[0] for u in user_scores if u[1] >= threshold]
        return outliers
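Example No. 5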
    def perform_outlier_detection(self, X):
        # LOF on all features

        clf = LocalOutlierFactor(n_neighbors=20)
        clf.fit(X)
        lof_scores = clf._decision_function(X)

        # Isolation forest on all features
        clf = IsolationForest()
        clf.fit(X)
        forest_scores = clf.decision_function(X)
        '''
        clf = DBOD()
        clf.fit(X)
        distance_scores = clf.decision_function_distance(X)

        #abod_scores = ABOD(X, self.seed_user)
        abod_scores = clf.decision_function_angle(X)

        scores = self.combine([lof_scores, forest_scores, distance_scores, abod_scores])
        '''
        # scores = forest_scores
        scores = self.combine([lof_scores, forest_scores])
        '''
        with open('clique_expansion/' + self.seed_user + '_unnormalized_scores.csv', 'w') as f:
            for score in scores:
                f.write(str(score) + '\n')
        '''
        new_scores = scores[self.len_priors:]
        user_scores = sorted(zip(self.current_level_users, new_scores), key=lambda x: x[1], reverse=True)
        threshold = np.percentile(new_scores, 8)
        outliers = [u[0] for u in user_scores if u[1] <= threshold]
        return outliers
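Both perform_outlier_detection variants funnel the per-detector scores through self.combine, which the snippets do not show. One plausible shape for such a combiner is a rank average over the per-detector score lists (illustrative only, not the author's implementation):

import numpy as np
from scipy.stats import rankdata

def combine(score_lists):
    # average per-detector ranks so scores on different scales
    # contribute equally to the combined score
    ranks = [rankdata(scores) for scores in score_lists]
    return np.mean(ranks, axis=0)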
Example No. 6
def run_samples():
    small_data = read_matlab_data('data/server_latency_throughput.mat')

    X = small_data.get('X')
    model = LocalOutlierFactor(n_neighbors=20)
    y_predict = model.fit_predict(X)
    outliers = np.where(y_predict == -1)
    inliers = np.where(y_predict == 1)
    print("Number of inliers= ", inliers[0].size)
    print("Number of outliers= ", outliers[0].size)

    n = np.arange(0, 35.5, 0.5)
    xx, yy = np.meshgrid(n, n)
    Z = model._decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.title("Local Outlier Factor (LOF)")
    plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
    in_data = plt.scatter(X[inliers, 0],
                          X[inliers, 1],
                          c='white',
                          edgecolor='k',
                          s=20,
                          label='inliers')

    out_data = plt.scatter(X[outliers, 0],
                           X[outliers, 1],
                           c='red',
                           edgecolor='k',
                           s=20,
                           label='outliers')

    plt.legend(handles=[in_data, out_data], loc="upper left")
    plt.show()
Example No. 7
class LOF:
    def fit(self, X):
        # fit on at most the first 4096 samples
        self._lof = LocalOutlierFactor(n_neighbors=16, n_jobs=-1).fit(X[:4096])
        return self

    def anomaly_scores(self, X):
        return -self._lof._decision_function(X)
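Example No. 8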
def ano_detect(flow, err, stfeature, label):
    """Run anomaly detection with FAL and LOF features and report metrics."""

    # FAL
    points = np.concatenate([stfeature, err], axis=1)
    detector = LocalOutlierFactor(n_neighbors=100)
    detector.fit(points)
    ano_scores = -detector._decision_function(points)
    compute_metrics(ano_scores, label, "FAL")

    # LOF
    points = flow
    detector = LocalOutlierFactor(n_neighbors=100)
    detector.fit(points)
    ano_scores = -detector._decision_function(points)
    compute_metrics(ano_scores, label, "LOF")
Example No. 9
def LOF_score(S):
    X = np.array(S)
    clf = LocalOutlierFactor()
    clf.fit(X)
    # negate so that higher values mean more anomalous
    factores = -clf._decision_function(X)
    return factores
Example No. 10
def regulof(X):
    from sklearn.neighbors import LocalOutlierFactor
    clf = LocalOutlierFactor(n_neighbors=50)
    y = clf.fit_predict(X)
    # decision surface on a grid (computed but not returned here)
    xx, yy = np.meshgrid(np.linspace(-1, 1, 500), np.linspace(-1, 1, 500))
    Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    return y
Example No. 11
def LOF(data, predict, k):

    clf = LocalOutlierFactor(n_neighbors=k + 1, algorithm='auto',
                             contamination=0.1, n_jobs=-1)
    clf.fit(data)
    # record the k-neighborhood distance
    predict['k distances'] = clf.kneighbors(predict)[0].max(axis=1)
    # record the LOF outlier factor, negated so higher = more anomalous
    predict['local outlier factor'] = -clf._decision_function(predict.iloc[:, :-1])

    return predict
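The same data/predict pattern recurs in Examples No. 13 and No. 15 below. A usage sketch (the data is made up; like the function itself, it requires an older scikit-learn that still has _decision_function):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
data = pd.DataFrame(rng.normal(size=(200, 2)), columns=['x', 'y'])
predict = data.copy()

result = LOF(data, predict, k=5)
print(result.sort_values('local outlier factor', ascending=False).head())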
Example No. 12
class LOF(AbstractDetector):
    name = "LOF"
    data_type = "REAL"

    def compute_scores(self, dataframe: pd.DataFrame, classes: np.ndarray):
        bin_dataframe = dataframe._binarize_categorical_values()

        self.clf = LocalOutlierFactor(**self.settings)
        self.clf.fit(bin_dataframe.values)
        self.values = self.clf._decision_function(bin_dataframe.values)
        return self
Example No. 13
def localoutlierfactor(data, predict, k):
    lof_clf = LocalOutlierFactor(n_neighbors=k + 1,
                                 contamination=0.2,
                                 n_jobs=-1)
    lof_clf.fit(data)
    # record the k-neighborhood distance
    predict['k_distances'] = lof_clf.kneighbors(predict)[0].max(axis=1)
    # record the LOF outlier factor, negated
    predict['local_outlier_factor'] = -lof_clf._decision_function(
        predict.iloc[:, :-1])
    return predict
Example No. 14
def LOF_anomaly_score(x):
    """Compute and plot LOF anomaly scores for each (data, name) pair in x."""
    # must be calibrated for all measurements
    outliers = []
    outliers_list = []
    for i, j in x:
        pd_i = pd.DataFrame(i)
        method = 1  # threshold on the LOF score separating outliers from inliers
        k = 30
        clf = LocalOutlierFactor(n_neighbors=k,
                                 algorithm='auto',
                                 contamination=0.1,
                                 n_jobs=-1)
        clf.fit(pd_i)
        # Record k neighborhood distance
        pd_i['k distances'] = clf.kneighbors(pd_i)[0].max(axis=1)
        # Record the LOF factor, negated so higher = more anomalous
        pd_i['local outlier factor'] = -clf._decision_function(
            pd_i.iloc[:, :-1])
        # Separate outlier and inlier points according to the threshold
        outliers = pd_i[pd_i['local outlier factor'] > method].sort_values(
            by='local outlier factor')
        inliers = pd_i[pd_i['local outlier factor'] <= method].sort_values(
            by='local outlier factor')
        # Figure
        plt.rcParams['axes.unicode_minus'] = False  # display the negative sign
        plt.figure(figsize=(8, 4)).add_subplot(111)
        plt.scatter(pd_i[pd_i['local outlier factor'] > method].index,
                    pd_i[pd_i['local outlier factor'] > method]
                    ['local outlier factor'],
                    c='red',
                    s=50,
                    marker='.',
                    alpha=None,
                    label='outliers')
        plt.scatter(pd_i[pd_i['local outlier factor'] <= method].index,
                    pd_i[pd_i['local outlier factor'] <= method]
                    ['local outlier factor'],
                    c='black',
                    s=50,
                    marker='.',
                    alpha=None,
                    label='inliers')
        plt.hlines(method, -2, 2 + max(pd_i.index), linestyles='--')
        plt.xlim(-2, 2 + max(pd_i.index))
        plt.title(f'LOF Local outlier detection of {j}', fontsize=13)
        plt.ylabel('Anomaly Score', fontsize=15)  # local outlier factors
        plt.legend()
        plt.savefig(f'LOF_images/LOF_{j}', format='png', dpi=1200)
        plt.show()
        outliers_list.append(list(outliers.index))
    return outliers_list
Example No. 15
def localoutlierfactor(data, predict, k):
    from sklearn.neighbors import LocalOutlierFactor
    clf = LocalOutlierFactor(n_neighbors=k + 1,
                             algorithm='auto',
                             contamination=0.1,
                             n_jobs=-1)
    clf.fit(data)
    # record the k-neighborhood distance
    predict['k distances'] = clf.kneighbors(predict)[0].max(axis=1)
    # record the LOF outlier factor, negated
    predict['local outlier factor'] = -clf._decision_function(
        predict.iloc[:, :-1])
    return predict
Example No. 16
def main():
    print("loading model")
    model_t = keras.models.load_model('model/model_t.h5', compile=False)
    model_r = keras.models.load_model('model/model_r.h5', compile=False)

    ds = DocDataset()

    x_train_snicor, _, _ = ds.load_train_data()
    x_test_snicer, x_test_boot = ds.load_test_data()

    train = model_t.predict(x_train_snicor)
    test_s = model_t.predict(x_test_snicer)
    test_b = model_t.predict(x_test_boot)

    train = train.reshape((len(x_train_snicor), -1))
    test_s = test_s.reshape((len(x_test_snicer), -1))
    test_b = test_b.reshape((len(x_test_boot), -1))

    # scale to the [0, 1] range
    ms = MinMaxScaler()
    train = ms.fit_transform(train)
    test_s = ms.transform(test_s)
    test_b = ms.transform(test_b)

    # fit the model
    clf = LocalOutlierFactor(n_neighbors=5)
    clf.fit(train)

    # anomaly scores
    Z1 = -clf._decision_function(test_s)
    Z2 = -clf._decision_function(test_b)

    # draw the ROC curve
    y_true = np.zeros(len(test_s) + len(test_b))
    y_true[len(test_s):] = 1  # 0: normal, 1: anomaly

    # compute FPR and TPR (and thresholds)
    fpr, tpr, _ = metrics.roc_curve(y_true, np.hstack((Z1, Z2)))

    # AUC
    auc = metrics.auc(fpr, tpr)

    # plot the ROC curve
    plt.plot(fpr, tpr, label='DeepOneClassification(AUC = %.2f)' % auc)
    plt.legend()
    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid(True)
    plt.show()
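Example No. 17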
def localOutlierFactor(data, predict, k):

    # LOF
    clf = LocalOutlierFactor(n_neighbors=k + 1,
                             algorithm='auto',
                             contamination=0.1,
                             n_jobs=-1)
    clf.fit(data)

    # compute the k-nearest-neighbor distance
    predict['k distances'] = clf.kneighbors(predict)[0].max(axis=1)

    # record the LOF factor, negated
    predict['local outlier factor'] = -clf._decision_function(
        predict.iloc[:, :-1])
    return predict
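Example No. 18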
    def perform_outlier_detection_all_combos(self, X):
        # LOF on all features
        scores = {'temporal': {}, 'content': {}, 'user': {}, 'network': {}}
        print "Starting anomaly detection loop"
        for key, value in X.iteritems():
            '''
            if key == 'user':
                continue
                '''
            print(key)
            '''
            clf = IsolationForest()
            clf.fit(value)
            scores[key]['iforest'] = clf.decision_function(value)
            print "Finished iforest"
            '''
            clf = LocalOutlierFactor(n_neighbors=20)
            clf.fit(value)
            scores[key]['lof'] = clf._decision_function(value)
            '''
            clf = DBOD()
            clf.fit(value)
            scores[key]['dbod'] = clf.decision_function_distance(value)

            #scores[key]['abod'] = ABOD(X, self.seed_user)
            scores[key]['abod'] = clf.decision_function_angle(value)
            '''
        print "Finished anomaly detection loop"
        with open(
                'clique_expansion/' + self.seed_user +
                '_unnormalized_scores.csv', 'w') as f:
            for domain, value in scores.items():
                for type_score, all_scores in value.items():
                    f.write(domain + ' ' + type_score + ',')
                    for item in all_scores:
                        f.write(str(item) + ',')
                    f.write('\n')
        combined_scores = self.combine_all(scores)
        scores = None
        new_scores = combined_scores[self.len_priors:]
        user_scores = sorted(zip(self.current_level_users, new_scores),
                             key=lambda x: x[1],
                             reverse=True)
        threshold = np.percentile(new_scores, 8)
        outliers = [u[0] for u in user_scores if u[1] <= threshold]
        return outliers
Example No. 19
def ml():
    import numpy as np
    from sklearn.neighbors import LocalOutlierFactor

    np.random.seed(42)

    # Generate train data
    X = 0.3 * np.random.randn(1000, 2)
    # Generate some abnormal novel observations
    X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
    X = np.r_[X + 2, X - 2, X_outliers]

    # fit the model
    clf = LocalOutlierFactor(n_neighbors=20)
    y_pred = clf.fit_predict(X)
    y_pred_outliers = y_pred[2000:]  # the 20 outliers follow the 2000 inliers

    # plot the level sets of the decision function
    size = 500
    xx, yy = np.meshgrid(np.linspace(-5, 5, size), np.linspace(-5, 5, size))
    Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    return Z
Example No. 20
def LOF_Score(trains, test_anomaly, test_normal):
    train_a = D.predict(trains) # train images
    test_a = D.predict(test_anomaly) # test anomaly
    test_b = D.predict(test_normal) # test normal

    train_a = train_a.reshape((len(trains),-1))
    test_a = test_a.reshape((len(test_anomaly),-1))
    test_b = test_b.reshape((len(test_normal),-1))
    print(train_a.shape, test_a.shape, test_b.shape)

    # MinMaxScaler
    ms = MinMaxScaler()
    train_a = ms.fit_transform(train_a)
    test_a = ms.transform(test_a)
    test_b = ms.transform(test_b)
    clf = LocalOutlierFactor(n_neighbors=5)
    clf.fit(train_a)


    # calculate anomaly scores
    Z1 = -clf._decision_function(test_a)
    Z2 = -clf._decision_function(test_b)
    print('ano score {}, normal score {}'.format(sum(Z1), sum(Z2)))
    return train_a, test_a, test_b
Example No. 21
test_s = test_s.reshape((len(X_test_s), -1))
print('reshape test normal', test_s.shape)
test_b = test_b.reshape((len(X_test_b), -1))
print('reshape test abnormal', test_b.shape)

print('fit model')
ms = MinMaxScaler()
train = ms.fit_transform(train)
test_s = ms.transform(test_s)
test_b = ms.transform(test_b)

# fit the model
clf = LocalOutlierFactor(n_neighbors=5)
clf.fit(train)

Z1 = -clf._decision_function(test_s)
Z2 = -clf._decision_function(test_b)

#ROC
y_true = np.zeros(len(test_s) + len(test_b))
y_true[len(test_s):] = 1
path = x_test_s_path + x_test_b_path

# precision, recall, f1 = caculate_acc(y_true, np.hstack((Z1,Z2)),path)

fpr, tpr, _ = metrics.roc_curve(y_true, np.hstack((Z1, Z2)))

# AUC
auc = metrics.auc(fpr, tpr)
print('auc', auc)
Example No. 22
class LOF(BaseDetector):
    """Wrapper of scikit-learn LOF Class with more functionalities.
    Unsupervised Outlier Detection using Local Outlier Factor (LOF).

    The anomaly score of each sample is called Local Outlier Factor.
    It measures the local deviation of density of a given sample with
    respect to its neighbors.
    It is local in that the anomaly score depends on how isolated the object
    is with respect to the surrounding neighborhood.
    More precisely, locality is given by k-nearest neighbors, whose distance
    is used to estimate the local density.
    By comparing the local density of a sample to the local densities of
    its neighbors, one can identify samples that have a substantially lower
    density than their neighbors. These are considered outliers.
    See :cite:`breunig2000lof` for details.

    Parameters
    ----------
    n_neighbors : int, optional (default=20)
        Number of neighbors to use by default for `kneighbors` queries.
        If n_neighbors is larger than the number of samples provided,
        all samples will be used.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use BallTree
        - 'kd_tree' will use KDTree
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default=30)
        Leaf size passed to `BallTree` or `KDTree`. This can
        affect the speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    metric : string or callable, default 'minkowski'
        metric used for the distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If 'precomputed', the training input X is expected to be a distance
        matrix.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
          'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
          'sqeuclidean', 'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics:
        http://docs.scipy.org/doc/scipy/reference/spatial.distance.html

    p : integer, optional (default = 2)
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
        See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. When fitting this is used to define the
        threshold on the decision function.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.
        Affects only kneighbors and kneighbors_graph methods.

    Attributes
    ----------
    n_neighbors_ : int
        The actual number of neighbors used for `kneighbors` queries.

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """
    def __init__(self,
                 n_neighbors=20,
                 algorithm='auto',
                 leaf_size=30,
                 metric='minkowski',
                 p=2,
                 metric_params=None,
                 contamination=0.1,
                 n_jobs=1):
        super(LOF, self).__init__(contamination=contamination)
        self.n_neighbors = n_neighbors
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs

    # noinspection PyIncorrectDocstring
    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = LocalOutlierFactor(n_neighbors=self.n_neighbors,
                                            algorithm=self.algorithm,
                                            leaf_size=self.leaf_size,
                                            metric=self.metric,
                                            p=self.p,
                                            metric_params=self.metric_params,
                                            contamination=self.contamination,
                                            n_jobs=self.n_jobs)
        self.detector_.fit(X=X, y=y)

        # Invert decision_scores_. Outliers come with higher outlier scores
        self.decision_scores_ = invert_order(
            self.detector_.negative_outlier_factor_)
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """

        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])

        # Invert outlier scores. Outliers come with higher outlier scores
        # noinspection PyProtectedMember
        if _get_sklearn_version() > 19:
            return invert_order(self.detector_._score_samples(X))
        else:
            return invert_order(self.detector_._decision_function(X))

    @property
    def n_neighbors_(self):
        """The actual number of neighbors used for kneighbors queries.
        Decorator for scikit-learn LOF attributes.
        """
        return self.detector_.n_neighbors_
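A brief usage sketch of this wrapper, using only the attributes its docstring documents (decision_scores_, labels_, threshold_); the synthetic data is illustrative, and the PyOD-style helpers the class relies on are assumed importable:

import numpy as np

rng = np.random.RandomState(0)
X_train = np.r_[0.3 * rng.randn(100, 2) + 2,
                rng.uniform(-4, 4, size=(10, 2))]

det = LOF(n_neighbors=20, contamination=0.1)
det.fit(X_train)
print(det.labels_[:10])        # 0 = inlier, 1 = outlier
print(det.threshold_)          # cutoff derived from contamination
scores = det.decision_function(X_train)  # higher = more anomalous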
Example No. 23
# Generate train data
X = 0.3 * np.random.randn(100, 2)
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.r_[X + 2, X - 2, X_outliers]


# fit the model
clf = LocalOutlierFactor(n_neighbors=20)
y_pred = clf.fit_predict(X)
y_pred_outliers = y_pred[200:]


# plot the level sets of the decision function
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)



plt.title("Local Outlier Factor (LOF)")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

a = plt.scatter(X[:200, 0], X[:200, 1], c='white',
                edgecolor='k', s=20)
b = plt.scatter(X[200:, 0], X[200:, 1], c='red',
                edgecolor='k', s=20)
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([a, b],
           ["normal observations",
            "abnormal observations"],
           loc="upper left")
plt.show()
Example No. 24
class LOF(BaseDetector):
    """Wrapper of scikit-learn LOF Class with more functionalities.
    Unsupervised Outlier Detection using Local Outlier Factor (LOF).

    The anomaly score of each sample is called Local Outlier Factor.
    It measures the local deviation of density of a given sample with
    respect to its neighbors.
    It is local in that the anomaly score depends on how isolated the object
    is with respect to the surrounding neighborhood.
    More precisely, locality is given by k-nearest neighbors, whose distance
    is used to estimate the local density.
    By comparing the local density of a sample to the local densities of
    its neighbors, one can identify samples that have a substantially lower
    density than their neighbors. These are considered outliers.
    See :cite:`breunig2000lof` for details.

    Parameters
    ----------
    n_neighbors : int, optional (default=20)
        Number of neighbors to use by default for `kneighbors` queries.
        If n_neighbors is larger than the number of samples provided,
        all samples will be used.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use BallTree
        - 'kd_tree' will use KDTree
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default=30)
        Leaf size passed to `BallTree` or `KDTree`. This can
        affect the speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    metric : string or callable, default 'minkowski'
        metric used for the distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If 'precomputed', the training input X is expected to be a distance
        matrix.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
          'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
          'sqeuclidean', 'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics:
        http://docs.scipy.org/doc/scipy/reference/spatial.distance.html

    p : integer, optional (default = 2)
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
        See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. When fitting this is used to define the
        threshold on the decision function.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.
        Affects only kneighbors and kneighbors_graph methods.

    Attributes
    ----------
    n_neighbors_ : int
        The actual number of neighbors used for `kneighbors` queries.

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, n_neighbors=20, algorithm='auto', leaf_size=30,
                 metric='minkowski', p=2, metric_params=None,
                 contamination=0.1, n_jobs=1):
        super(LOF, self).__init__(contamination=contamination)
        self.n_neighbors = n_neighbors
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs

    # noinspection PyIncorrectDocstring
    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = LocalOutlierFactor(n_neighbors=self.n_neighbors,
                                            algorithm=self.algorithm,
                                            leaf_size=self.leaf_size,
                                            metric=self.metric,
                                            p=self.p,
                                            metric_params=self.metric_params,
                                            contamination=self.contamination,
                                            n_jobs=self.n_jobs)
        self.detector_.fit(X=X, y=y)

        # Invert decision_scores_. Outliers come with higher outlier scores
        self.decision_scores_ = invert_order(
            self.detector_.negative_outlier_factor_)
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """

        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])

        # Invert outlier scores. Outliers come with higher outlier scores
        # noinspection PyProtectedMember
        if _sklearn_version_20():
            return invert_order(self.detector_._score_samples(X))
        else:
            return invert_order(self.detector_._decision_function(X))

    @property
    def n_neighbors_(self):
        """The actual number of neighbors used for kneighbors queries.
        Decorator for scikit-learn LOF attributes.
        """
        return self.detector_.n_neighbors_
Example No. 25
np.random.seed(42)

# Generate train data
X = 0.3 * np.random.randn(100, 2)
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.r_[X + 2, X - 2, X_outliers]

# fit the model
clf = LocalOutlierFactor(n_neighbors=20)
y_pred = clf.fit_predict(X)
y_pred_outliers = y_pred[200:]

# plot the level sets of the decision function
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Local Outlier Factor (LOF)")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

a = plt.scatter(X[:200, 0], X[:200, 1], c='white')
b = plt.scatter(X[200:, 0], X[200:, 1], c='red')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([a, b],
           ["normal observations",
            "abnormal observations"],
           loc="upper left")
plt.show()
Example No. 26
labels_test = np.zeros(y_test.shape[0])
for index, i in enumerate(y_test):
    if i in [1, 2, 3, 8, 9, 10]:
        labels_test[index] = 1

labels_val = np.zeros(y_val.shape[0])
for index, i in enumerate(y_val):
    if i in [1, 2, 3, 8, 9, 10]:
        labels_val[index] = 1

best_clf = None
best_auc = 0
best_params = None
for n_neighbors_ in range(10, 51):
    for contamination_ in np.arange(0.01, 0.11, 0.01):
        clf = LocalOutlierFactor(n_neighbors=n_neighbors_, contamination=contamination_)
        clf.fit(X_train)
        val_score = clf._decision_function(X_val)
        x, y, threshold = roc_curve(labels_val, -val_score)
        a = auc(x, y)
        print('n_neighbor:%s, contamination:%s, auc:%s'%(n_neighbors_, contamination_, a))
        if a > best_auc:
            best_auc = a
            best_clf = clf
            best_params = (n_neighbors_, contamination_)

test_score = best_clf._decision_function(X_test)
x, y, _ = roc_curve(labels_test, -test_score)
test_auc = auc(x, y)
print('best in val data : auc:%s, n_neighbors:%s, contamination:%s'%(best_auc, best_params[0],
                                                                     best_params[1]))
plt.plot(x, y, c='red', label='%s auc' % test_auc)
plt.plot([0, 1], [0, 1], c='navy', linestyle='--')
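Example No. 27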
class LOF(BaseDetector):
    """ Local outlier factor (LOF).

    Parameters
    ----------
    k : int (default=10)
        Number of nearest neighbors.

    contamination : float (default=0.1)
        Estimate of the expected percentage of anomalies in the data.

    metric : string (default=euclidean)
        Distance metric for the distance computation.

    Comments
    --------
    - This method DOES NOT EASILY extend to OUT-OF-SAMPLE setting!
    - The number of neighbors cannot be larger than the number of instances in
    the data: automatically correct if necessary.
    """
    def __init__(self,
                 k=10,
                 contamination=0.1,
                 metric='euclidean',
                 tol=1e-8,
                 verbose=False):
        super(LOF, self).__init__()

        self.k = int(k)
        self.contamination = float(contamination)
        self.metric = str(metric)

        self.tol = float(tol)
        self.verbose = bool(verbose)

    def fit_predict(self, X, y=None):
        """ Fit the model to the training set X and returns the anomaly score
            of the instances in X.

        :param X : np.array(), shape (n_samples, n_features)
            The samples to compute anomaly score w.r.t. the training samples.
        :param y : np.array(), shape (n_samples), default = None
            Labels for examples in X.

        :returns y_score : np.array(), shape (n_samples)
            Anomaly score for the examples in X.
        :returns y_pred : np.array(), shape (n_samples)
            Returns -1 for inliers and +1 for anomalies/outliers.
        """

        if y is not None:
            X, y = check_X_y(X, y)  # check_X_y raises when y is None

        return self.fit(X, y).predict(X)

    def fit(self, X, y=None):
        """ Fit the model using data in X.

        :param X : np.array(), shape (n_samples, n_features)
            The samples to compute anomaly score w.r.t. the training samples.
        :param y : np.array(), shape (n_samples), default = None
            Labels for examples in X.

        :returns self : object
        """

        from sklearn.utils import check_array
        X = check_array(X)  # y is ignored; check_X_y raises when y is None
        n, _ = X.shape

        nn = self._check_valid_number_of_neighbors(n)
        self.clf = LocalOutlierFactor(n_neighbors=nn,
                                      contamination=self.contamination,
                                      metric=self.metric)
        self.clf.fit(X)

        return self

    def predict(self, X):
        """ Compute the anomaly score + predict the label of instances in X.

        :returns y_score : np.array(), shape (n_samples)
            Anomaly score for the examples in X.
        :returns y_pred : np.array(), shape (n_samples)
            Returns -1 for inliers and +1 for anomalies/outliers.
        """

        from sklearn.utils import check_array
        X = check_array(X)  # labels are not needed for prediction
        n, _ = X.shape

        # predict the anomaly scores: shifted opposite of the LOF of X
        lof_score = -self.clf._decision_function(X)

        # scaled y_score
        y_score = (lof_score - min(lof_score)) / (max(lof_score) -
                                                  min(lof_score))

        # prediction threshold + absolute predictions
        self.threshold = np.sort(y_score)[int(n * (1.0 - self.contamination))]
        y_pred = np.ones(n, dtype=float)
        y_pred[y_score < self.threshold] = -1

        return y_score, y_pred

    def _check_valid_number_of_neighbors(self, n_samples):
        """ Check if the number of nearest neighbors is valid and correct.

        :param n_samples : int
            Number of samples in the data.
        """

        return min(n_samples, self.k)
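A usage sketch for this detector on synthetic data (it requires an older scikit-learn where LocalOutlierFactor still exposes _decision_function, which the predict method above relies on):

import numpy as np

rng = np.random.RandomState(0)
X = np.r_[rng.randn(100, 2), rng.uniform(-6, 6, size=(5, 2))]

detector = LOF(k=10, contamination=0.05)
y_score, y_pred = detector.fit_predict(X)  # scores scaled to [0, 1]
print(y_pred[-5:])                         # +1 marks the flagged anomalies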
Example No. 28
def results_point_difficulty(data_original, settings):
    """Generate datasets with different point difficulties of the anomaly
       class, then train, predict and evaluate various models.
       Input:  * data_original: dict with all prepared datasets
               * settings: dict with 'results_dir' and the
                 'settings_point_difficulty' options (anom_freq, n_datasets,
                 models_train)
       Output: * per-model dicts with a mean ROC curve and roc_auc score for
                 each generated dataset, saved to results_dir
    """
    results_dir = settings['results_dir']
    settings = settings['settings_point_difficulty']
    n_datasets = settings['n_datasets']
    results_point_difficulty_lr = dict()
    results_point_difficulty_gbm = dict()
    results_point_difficulty_iforest = dict()
    results_point_difficulty_lof = dict()
    results_point_difficulty_ae_unsupervised = dict()
    results_point_difficulty_ae_supervised = dict()

    for dataset in data_original.keys():
        print('train on dataset: {}'.format(dataset))
        results_point_difficulty_lr[dataset] = dict()
        results_point_difficulty_gbm[dataset] = dict()
        results_point_difficulty_iforest[dataset] = dict()
        results_point_difficulty_lof[dataset] = dict()
        results_point_difficulty_ae_unsupervised[dataset] = dict()
        results_point_difficulty_ae_supervised[dataset] = dict()
        data_reg = data_original[dataset]['regular']
        anom = data_original[dataset]['anom'].sort_values('point_difficulty')
        num_anom = np.round(settings['anom_freq'] * data_reg.shape[0] / \
                            (1 - settings['anom_freq']))
        step = np.round(anom.shape[0] / (n_datasets + 1))
        for i in range(n_datasets):
            y_pred_lr, y_pred_gbm, y_pred_iforest, y_pred_lof = [], [], [], []
            y_pred_ae_unsupervised, y_pred_ae_supervised, y_true = [], [], []
            #roc_auc_gbm, roc_auc_iforest, roc_auc_lof = [], [], []
            data_anom = anom.iloc[
                int(i * step):int(min(i * step + num_anom, anom.shape[0])), :]
            data_sample = pd.concat([data_reg, data_anom]).sample(frac=1)\
                .reset_index(drop=True)
            X = data_sample.iloc[:, :-2]
            y = data_sample.iloc[:, -2]
            skf = StratifiedKFold(n_splits=3)
            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
                y_train, y_test = y[train_index], y[test_index]
                X_train_unsupervised = X_train[y_train == 0]
                y_true.append(y_test)

                # Logistic Regression:
                if settings['models_train']['lr']:
                    lr = LogisticRegression()
                    lr.fit(X_train, y_train)
                    y_pred_lr.append(lr.predict_proba(X_test)[:, 1])

                # GBM:
                if settings['models_train']['gbm']:
                    gbm = GradientBoostingClassifier()
                    gbm.fit(X_train, y_train)
                    y_pred_gbm.append(gbm.predict_proba(X_test)[:, 1])

                # Isolation Forest:
                if settings['models_train']['iforest']:
                    iforest = IsolationForest()
                    iforest.fit(X_train_unsupervised)
                    decision_function = iforest.decision_function(X_test)
                    y_pred_iforest.append(1 - np.interp(decision_function, \
                                                (decision_function.min(),
                                                 decision_function.max()), (0, 1)))

                # Local Outlier Factor (LOF):
                if settings['models_train']['lof']:
                    lof = LocalOutlierFactor()
                    lof.fit(X_train_unsupervised)
                    decision_function = lof._decision_function(X_test)
                    y_pred_lof.append(1 - np.interp(decision_function, \
                                                (decision_function.min(),
                                                 decision_function.max()), (0, 1)))

                # Autoencoder unsupervised
                if settings['models_train']['autoencoder_unsupervised']:
                    input_dim = X_train_unsupervised.shape[1]
                    ae = autoencoder.autoencoder_unsupervised(
                        input_dim=input_dim)
                    ae.fit(X_train_unsupervised,
                           X_train_unsupervised,
                           batch_size=50,
                           epochs=2,
                           verbose=0)
                    X_test_pred = ae.predict(X_test)
                    y_pred_ae_unsupervised.append(autoencoder.\
                                    reconstruction_error(X_test, X_test_pred))

                # Autoencoder supervised
                if settings['models_train']['autoencoder_supervised']:
                    input_dim = X_train.shape[1]
                    ae = autoencoder.autoencoder_supervised(
                        input_dim=input_dim)
                    y_train = pd.concat([X_train, y_train], axis=1)
                    ae.fit(X_train,
                           y_train,
                           batch_size=50,
                           epochs=2,
                           verbose=0)
                    X_test_pred = ae.predict(X_test)
                    y_pred_ae_supervised.append(autoencoder.\
                                    reconstruction_error(X_test, X_test_pred))

            if settings['models_train']['lr']:
                mean_fpr, mean_tpr, mean_auc = create_mean_roc_auc(
                    y_true, y_pred_lr)
                results_point_difficulty_lr[dataset]\
                    [np.round(i / n_datasets, 2)] = (mean_fpr, mean_tpr, mean_auc)
            if settings['models_train']['gbm']:
                mean_fpr, mean_tpr, mean_auc = create_mean_roc_auc(
                    y_true, y_pred_gbm)
                results_point_difficulty_gbm[dataset][np.round(i / n_datasets, 2)] = \
                    (mean_fpr, mean_tpr, mean_auc)
            if settings['models_train']['iforest']:
                mean_fpr, mean_tpr, mean_auc = create_mean_roc_auc(
                    y_true, y_pred_iforest)
                results_point_difficulty_iforest[dataset]\
                    [np.round(i / n_datasets, 2)] = (mean_fpr, mean_tpr, mean_auc)
            if settings['models_train']['lof']:
                mean_fpr, mean_tpr, mean_auc = create_mean_roc_auc(
                    y_true, y_pred_lof)
                results_point_difficulty_lof[dataset][np.round(i / n_datasets, 2)] = \
                    (mean_fpr, mean_tpr, mean_auc)
            if settings['models_train']['autoencoder_unsupervised']:
                mean_fpr, mean_tpr, mean_auc = create_mean_roc_auc(
                    y_true, y_pred_ae_unsupervised)
                results_point_difficulty_ae_unsupervised[dataset]\
                    [np.round(i / n_datasets, 2)] = (mean_fpr, mean_tpr, mean_auc)
            if settings['models_train']['autoencoder_supervised']:
                mean_fpr, mean_tpr, mean_auc = create_mean_roc_auc(
                    y_true, y_pred_ae_supervised)
                results_point_difficulty_ae_supervised[dataset]\
                    [np.round(i / n_datasets, 2)] = (mean_fpr, mean_tpr, mean_auc)

    timestr = time.strftime("%H%M%S")
    if settings['models_train']['lr']:
        name = 'results_point_difficulty_lr_{}'.format(timestr)
        save_results(results_point_difficulty_lr, results_dir, name)
    if settings['models_train']['gbm']:
        name = 'results_point_difficulty_gbm_{}'.format(timestr)
        save_results(results_point_difficulty_gbm, results_dir, name)
    if settings['models_train']['iforest']:
        name = 'results_point_difficulty_iforest_{}'.format(timestr)
        save_results(results_point_difficulty_iforest, results_dir, name)
    if settings['models_train']['lof']:
        name = 'results_point_difficulty_lof_{}'.format(timestr)
        save_results(results_point_difficulty_lof, results_dir, name)
    if settings['models_train']['autoencoder_unsupervised']:
        name = 'results_point_difficulty_ae_unsupervised_{}'.format(timestr)
        save_results(results_point_difficulty_ae_unsupervised, results_dir,
                     name)
    if settings['models_train']['autoencoder_supervised']:
        name = 'results_point_difficulty_ae_supervised_{}'.format(timestr)
        save_results(results_point_difficulty_ae_supervised, results_dir, name)
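The function above only reads a handful of keys from settings; a minimal configuration that satisfies it might look like this (values are illustrative):

settings = {
    'results_dir': 'results/',
    'settings_point_difficulty': {
        'n_datasets': 10,
        'anom_freq': 0.01,
        'models_train': {
            'lr': True,
            'gbm': True,
            'iforest': True,
            'lof': True,
            'autoencoder_unsupervised': False,
            'autoencoder_supervised': False,
        },
    },
}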
Example No. 29
def plot(X, y_pred, clf, indexes=[]):
    print("indexes: %s" % indexes)
    pca = PCA(n_components=2)

    scaler = MaxAbsScaler()
    X_scaled = scaler.fit_transform(X)

    def fullprint(*args, **kwargs):
        from pprint import pprint
        import sys
        import numpy
        opt = numpy.get_printoptions()
        # threshold='nan' fails on modern NumPy; use sys.maxsize instead
        numpy.set_printoptions(threshold=sys.maxsize)
        pprint(*args, **kwargs)
        numpy.set_printoptions(**opt)

    #print("*"*80)
    #print("*"*80)
    #print("X before transformation:")
    #fullprint(X_scaled)
    #print("*"*80)
    #print("*"*80)
    #print("*"*80)

    X = pca.fit_transform(X_scaled)

    #print("X after transformation:")
    #print(X)
    #print("*"*80)
    #print("*"*80)
    #print("*"*80)

    np.random.seed(42)

    # Generate train data
    #X = 0.3 * np.random.randn(100, 2)
    # Generate some abnormal novel observations
    #X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
    #X = np.r_[X + 2, X - 2, X_outliers]

    # fit the model
    clf = LocalOutlierFactor(n_neighbors=20)
    # re-fitting a new model on the 2d data
    y_pred = clf.fit_predict(X)
    assert (len(X) == len(y_pred))
    zipped = list(zip(X, y_pred))  # materialize: it is iterated twice below

    inliers = np.array([i[0] for i in zipped if i[1] == 1])
    outliers = np.array([i[0] for i in zipped if i[1] == -1])
    assert (len(y_pred) == len(inliers) + len(outliers))

    call_outs = []
    if len(indexes) > 0:
        assert (all([len(i) == len(X) for i in indexes]))

        for index in indexes:
            zip_index = zip(X, index)
            call_out = np.array([i[0] for i in zip_index if i[1] == 1])
            call_outs.append(call_out)

    call_outs = np.array(call_outs)
    print("call_outs:")
    print(call_outs)

    # plot the level sets of the decision function
    xx, yy = np.meshgrid(np.linspace(-1, 1, 50), np.linspace(-1, 1, 50))
    Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    print("*" * 80)
    print("Z:")
    print(Z.shape)
    print(Z)
    print("*" * 80)

    plt.title("Local Outlier Factor (LOF) for AIS-Scenario16")
    plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

    if len(call_outs) > 0:
        legend = []
        for call_out, color in zip(call_outs, ["green", "purple", "orange"]):
            c = plt.scatter(
                call_out[:, 0],
                call_out[:, 1],
                c=color,
                #                            marker=".", alpha=0.1,
                s=20)
            legend.append(c)
        plt.legend(legend,
                   ["removes r = x/ y", "removes long switch statement"],
                   loc="upper left")
    else:
        a = plt.scatter(inliers[:, 0],
                        inliers[:, 1],
                        c='white',
                        edgecolor='k',
                        s=20)
        b = plt.scatter(outliers[:, 0],
                        outliers[:, 1],
                        c='red',
                        edgecolor='k',
                        s=20)
        plt.legend([a, b], ["typical behavior", "outliers"], loc="upper left")

    plt.axis('tight')
    max_x = max([i[0] for i in X])
    min_x = min([i[0] for i in X])
    max_y = max([i[1] for i in X])
    min_y = min([i[1] for i in X])
    plt.autoscale()
    #plt.xlim((min_x - 0.01, max_x + 0.01))
    #plt.ylim((min_y - 0.01, max_y + 0.01))
    #plt.xlim((-.2,.1))
    #plt.ylim((-.2,.2))

    plt.savefig("visualization1.png")
    plt.show()
Example No. 30
def predict(x_train_s, x_test_s, x_test_b, model) -> None:
    train = model.predict(x_train_s)
    test_s = model.predict(x_test_s)
    test_b = model.predict(x_test_b)

    train = train.reshape((len(x_train_s), -1))
    test_s = test_s.reshape((len(x_test_s), -1))
    test_b = test_b.reshape((len(x_test_b), -1))

    ms = MinMaxScaler()
    train = ms.fit_transform(train)
    test_s = ms.transform(test_s)
    test_b = ms.transform(test_b)

    clf = LocalOutlierFactor(n_neighbors=5)
    _ = clf.fit(train)

    z1 = -clf._decision_function(test_s)
    z2 = -clf._decision_function(test_b)

    TOP_K = 5
    unsorted_max_indeces = np.argpartition(-z1, TOP_K)[:TOP_K]
    y = z1[unsorted_max_indeces]
    indices = np.argsort(-y)
    max_k_indices = unsorted_max_indeces[indices]
    plt.figure()
    for count, i in enumerate(max_k_indices):
        plt.subplot(1, TOP_K, count + 1)
        plt.imshow(x_test_s[i])
        plt.title(f"index: {i}\n{z1[i]:.3e}")
        plt.tick_params(labelbottom=False,
                        labelleft=False,
                        labelright=False,
                        labeltop=False)
        plt.tick_params(bottom=False, left=False, right=False, top=False)
    plt.savefig("_data/x_test_s_top_k.png")  # save before show(), or the file may be blank
    plt.show()

    unsorted_max_indeces = np.argpartition(-z2, TOP_K)[:TOP_K]
    y = z2[unsorted_max_indeces]
    indices = np.argsort(-y)
    max_k_indices = unsorted_max_indeces[indices]
    plt.figure()
    for count, i in enumerate(max_k_indices):
        plt.subplot(1, TOP_K, count + 1)
        plt.imshow(x_test_b[i])
        plt.title(f"index: {i}\n{z2[i]:.3e}")
        plt.tick_params(labelbottom=False,
                        labelleft=False,
                        labelright=False,
                        labeltop=False)
        plt.tick_params(bottom=False, left=False, right=False, top=False)
    plt.savefig("_data/x_test_b_top_k.png")
    plt.show()

    unsorted_max_indeces = np.argpartition(z1, TOP_K)[:TOP_K]
    y = z1[unsorted_max_indeces]
    indices = np.argsort(y)
    max_k_indices = unsorted_max_indeces[indices]
    plt.figure()
    for count, i in enumerate(max_k_indices):
        plt.subplot(1, TOP_K, count + 1)
        plt.imshow(x_test_s[i])
        plt.title(f"index: {i}\n{z1[i]:.3e}")
        plt.tick_params(labelbottom=False,
                        labelleft=False,
                        labelright=False,
                        labeltop=False)
        plt.tick_params(bottom=False, left=False, right=False, top=False)
    plt.savefig("_data/x_test_s_under_k.png")
    plt.show()

    unsorted_max_indeces = np.argpartition(z2, TOP_K)[:TOP_K]
    y = z2[unsorted_max_indeces]
    indices = np.argsort(y)
    max_k_indices = unsorted_max_indeces[indices]
    plt.figure()
    for count, i in enumerate(max_k_indices):
        plt.subplot(1, TOP_K, count + 1)
        plt.imshow(x_test_b[i])
        plt.title(f"index: {i}\n{z2[i]:.3e}")
        plt.tick_params(labelbottom=False,
                        labelleft=False,
                        labelright=False,
                        labeltop=False)
        plt.tick_params(bottom=False, left=False, right=False, top=False)
    plt.savefig("_data/x_test_b_under_k.png")
    plt.show()

    y_true = np.zeros(len(test_s) + len(test_b))
    y_true[len(test_s):] = 1  # normal = 0, abnormal = 1

    fpr, tpr, _ = metrics.roc_curve(y_true, np.hstack((z1, z2)))
    auc = metrics.auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, label=f"DOC (AUC = {auc:.3f})")
    plt.legend()
    plt.title("ROC curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.grid(True)
    plt.savefig("_data/roc_curve.png")
    plt.show()
Example No. 31
def main(camera_FPS, camera_width, camera_height, inference_scale, threshold,
         device):

    path = "pictures/"
    if not os.path.exists(path):
        os.mkdir(path)

    model_path = "OneClassAnomalyDetection-RaspberryPi3/DOC/model/"
    if os.path.exists(model_path):
        # LOF
        print("LOF model building...")
        x_train = np.loadtxt(model_path + "train.csv", delimiter=",")

        ms = MinMaxScaler()
        x_train = ms.fit_transform(x_train)

        # fit the LOF model
        # novelty=True is required to score frames outside the training set
        # through the public decision_function API.
        clf = LocalOutlierFactor(n_neighbors=5, novelty=True)
        clf.fit(x_train)

        # DOC
        print("DOC Model loading...")
        if device == "MYRIAD":
            model_xml = "irmodels/tensorflow/FP16/weights.xml"
            model_bin = "irmodels/tensorflow/FP16/weights.bin"
        else:
            model_xml = "irmodels/tensorflow/FP32/weights.xml"
            model_bin = "irmodels/tensorflow/FP32/weights.bin"
        net = IENetwork(model=model_xml, weights=model_bin)
        plugin = IEPlugin(device=device)
        if device == "CPU":
            if platform.processor() == "x86_64":
                plugin.add_cpu_extension("lib/x86_64/libcpu_extension.so")
        exec_net = plugin.load(network=net)
        input_blob = next(iter(net.inputs))
        print("loading finish")
    else:
        print("Nothing model folder")
        sys.exit(0)

    base_range = min(camera_width, camera_height)
    stretch_ratio = inference_scale / base_range
    resize_image_width = int(camera_width * stretch_ratio)
    resize_image_height = int(camera_height * stretch_ratio)

    if base_range == camera_height:
        crop_start_x = (resize_image_width - inference_scale) // 2
        crop_start_y = 0
    else:
        crop_start_x = 0
        crop_start_y = (resize_image_height - inference_scale) // 2
    crop_end_x = crop_start_x + inference_scale
    crop_end_y = crop_start_y + inference_scale
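    # The resize/crop bookkeeping above implements a center crop: the frame is
    # scaled so its shorter side equals inference_scale, then an
    # inference_scale x inference_scale square is taken from the middle of the
    # longer side.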

    fps = ""
    message = "Push [p] to take a picture"
    result = "Push [s] to start anomaly detection"
    flag_score = False
    picture_num = 1
    elapsedTime = 0
    score = 0
    score_mean = np.zeros(10)
    mean_NO = 0

    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FPS, camera_FPS)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, camera_width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, camera_height)

    time.sleep(1)

    while cap.isOpened():
        t1 = time.time()

        ret, image = cap.read()

        if not ret:
            break

        image_copy = image.copy()

        # prediction
        if flag_score:
            prepimg = cv2.resize(image,
                                 (resize_image_width, resize_image_height))
            prepimg = prepimg[crop_start_y:crop_end_y, crop_start_x:crop_end_x]
            prepimg = np.array(prepimg).reshape(
                (1, inference_scale, inference_scale, 3))
            prepimg = prepimg / 255  # scale pixel values to [0, 1]
            prepimg = prepimg.transpose((0, 3, 1, 2))  # NHWC -> NCHW

            exec_net.start_async(request_id=0, inputs={input_blob: prepimg})
            exec_net.requests[0].wait(-1)
            outputs = exec_net.requests[0].outputs["Reshape_"]
            outputs = outputs.reshape((len(outputs), -1))
            outputs = ms.transform(outputs)
            score = -clf.decision_function(outputs)

        # output score
        if not flag_score:
            cv2.putText(image, result, (camera_width - 350, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1,
                        cv2.LINE_AA)
        else:
            # Ring buffer: keep the last 10 scores for a rolling mean.
            score_mean[mean_NO] = score[0]
            mean_NO += 1
            if mean_NO == len(score_mean):
                mean_NO = 0

            if np.mean(score_mean) > threshold:  # red if the score is high
                cv2.putText(image, "{:.1f} Score".format(np.mean(score_mean)),
                            (camera_width - 230, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 1,
                            cv2.LINE_AA)
            else:  # green if the score is low
                cv2.putText(image, "{:.1f} Score".format(np.mean(score_mean)),
                            (camera_width - 230, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1,
                            cv2.LINE_AA)

        # message
        cv2.putText(image, message, (camera_width - 285, 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)
        cv2.putText(image, fps, (camera_width - 164, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 1, cv2.LINE_AA)

        cv2.imshow("Result", image)

        # FPS
        elapsedTime = time.time() - t1
        fps = "{:.0f} FPS".format(1 / elapsedTime)

        # quit or calculate score or take a picture
        key = cv2.waitKey(1) & 0xFF
        if key == ord("q"):
            break
        if key == ord("p"):
            cv2.imwrite(path + str(picture_num) + ".jpg", image_copy)
            picture_num += 1
        if key == ord("s"):
            flag_score = True

    cv2.destroyAllWindows()
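The score smoothing in the loop above is a fixed-size ring buffer averaged with np.mean. A self-contained sketch of the same idea using collections.deque (the RollingMean name is illustrative, not from the original code):

from collections import deque

class RollingMean:
    """Mean over the most recent `size` values."""

    def __init__(self, size=10):
        self._buf = deque(maxlen=size)  # oldest value drops off automatically

    def update(self, value):
        self._buf.append(value)
        return sum(self._buf) / len(self._buf)

One behavioral difference: the original buffer is pre-filled with zeros, so the first ten means are pulled toward zero, whereas the deque version averages only the scores seen so far.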
Example No. 32
def analyze(data):
    # Convert the JSON payload to Python objects so ML algorithms can run on it
    json_to_python = json.loads(data)

    # Data pre-processing: collect the response sizes seen for each host
    per_size = dict()  # IP -> list of response sizes
    hostlist = dict()

    for y in json_to_python:
        hostlist[y['HOST']] = 1
        if y['HOST'] in per_size:
            per_size[y['HOST']].append(int(y['SIZE']))
        else:
            per_size[y['HOST']] = [int(y['SIZE'])]

    log.debug(
        "*** Printing Input to analysis - 4 (1): K-means on IP and average response size ****"
    )

    #####*****SIZE******####
    #### Analysis #4 (1): IP address vs. size of response received
    X = np.array([[0.00, 0.00]])  # seed row; note it adds a dummy (0, 0) point
    for x in hostlist:
        avg_size = mean(per_size[x])
        log.debug(x + ": " + str(avg_size))

        # Zero-pad every octet to three digits so the dotted quad becomes one
        # fixed-width number, e.g. 10.0.2.15 -> "010000002015".
        ip = "".join(octet.zfill(3) for octet in x.split("."))

        # log.debug(str(float(ip) / 1000) + ": " + str(avg_size))
        le = [float(ip) / 1000, avg_size]
        X = np.vstack([X, le])

    log.info(
        "********   Analysis #4 (3) :  IP-Address and Response Size received: LocalOutlierFactor  ********"
    )
    log.info(
        "******** Please check the image test-save-outlier-LOF.png saved in your working directory for more info. ********"
    )

    ######################################################
    ## Analysis #4 (3): Outlier detection
    np.random.seed(42)

    # Fit the model. novelty=True exposes the public decision_function,
    # which the level-set plot below needs for arbitrary grid points.
    clf = LocalOutlierFactor(n_neighbors=20, novelty=True)
    clf.fit(X)

    # With novelty=True, predict() is meant for new data; it is used on the
    # training set here for visualization only.
    y_pred = clf.predict(X)
    y_pred_outliers = y_pred[200:]

    # plot the level sets of the decision function
    xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.title("Local Outlier Factor (LOF)")
    plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

    # The split at row 200 mirrors the scikit-learn LOF example: earlier rows
    # are drawn as normal, later rows as abnormal.
    a = plt.scatter(X[:200, 0], X[:200, 1], c='white', edgecolor='k', s=20)
    b = plt.scatter(X[200:, 0], X[200:, 1], c='red', edgecolor='k', s=20)
    plt.axis('tight')
    # plt.xlim((-5, 5))
    # plt.ylim((-5, 5))
    plt.legend([a, b], ["normal observations", "abnormal observations"],
               loc="upper left")
    ##plt.show()
    plt.savefig('test-save-outlier-LOF.png')
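The zero-pad-and-concatenate encoding above is one way to collapse an IP address into a single numeric feature. A purely illustrative alternative (ip_to_feature is not part of the original pipeline) uses the standard-library ipaddress module, which maps each IPv4 address to its 32-bit integer value:

import ipaddress

def ip_to_feature(host: str) -> float:
    """Map a dotted-quad IPv4 address to its 32-bit integer value."""
    return float(int(ipaddress.ip_address(host)))

print(ip_to_feature("10.0.2.15"))  # 167772687.0

Both encodings preserve the ordering of addresses; the integer form additionally keeps the feature within the 32-bit range instead of producing 12-digit values.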