def MinMaxScaler_ano_score():
    """Print summed LOF anomaly scores for held-out normal and anomalous data.

    Features come from the module-level ``metricLR_model`` applied to the
    module-level arrays ``x_normal`` (training), ``x_test_normal`` and
    ``x_ano``. Each prediction is flattened to one row per sample, min-max
    scaled with statistics of the training rows only, and scored by a
    ``LocalOutlierFactor`` fitted on the scaled training rows.

    Returns nothing; the shapes and the two summed scores are printed.
    """
    print("calculate ano score between normal and anomaly")

    def create_score_data(model, images):
        """Run ``model.predict`` and flatten every prediction to a 1-D row."""
        pred_images = model.predict(images)
        return pred_images.reshape((len(pred_images), -1))

    train_normal = create_score_data(metricLR_model, x_normal)  # train (normal)
    test_normal = create_score_data(metricLR_model, x_test_normal)  # test normal
    test_ano = create_score_data(metricLR_model, x_ano)  # test anomaly
    # Fixed: this print used train_a/test_a/test_b, which do not exist here.
    print(train_normal.shape, test_normal.shape, test_ano.shape)

    # Scale with the training range only so test data cannot leak into it.
    ms = MinMaxScaler()
    train_normal = ms.fit_transform(train_normal)
    clf = LocalOutlierFactor(n_neighbors=5)
    clf.fit(train_normal)

    test_normal = ms.transform(test_normal)
    test_ano = ms.transform(test_ano)

    # NOTE(review): `_decision_function` is a private scikit-learn method;
    # confirm it exists (and what it returns) in the pinned version.
    # The sign is flipped so that larger means more anomalous.
    normal_scores = -clf._decision_function(test_normal)
    ano_scores = -clf._decision_function(test_ano)
    # Fixed: the two sums were printed under each other's label.
    print('ano score {}, normal score {}'.format(sum(ano_scores),
                                                 sum(normal_scores)))
class LocalOutlierFactorScore(GraphScore):
    """Score every row of a feature matrix with a LocalOutlierFactor model.

    The first rows are scored against one model fitted up front; once the
    row index reaches the window size, the model is refitted on the
    preceding ``window_size`` rows before each row is scored.
    """

    def __init__(self, beta_matrix, database_name, window_size=None,
                 n_neighbors=40):
        """Store the window size and estimator, then defer to the base class.

        :param beta_matrix: 2-D matrix, one row per graph.
        :param database_name: forwarded unchanged to ``GraphScore``.
        :param window_size: rows per refit window; defaults to the number
            of rows of ``beta_matrix``.
        :param n_neighbors: forwarded to ``LocalOutlierFactor``.
        """
        if window_size is None:
            window_size = beta_matrix.shape[0]
        # Set before the base initialiser runs, as in the original ordering.
        self._split = window_size
        self._clf = LocalOutlierFactor(n_neighbors=n_neighbors,
                                       contamination='auto')
        super(LocalOutlierFactorScore, self).__init__(beta_matrix,
                                                      database_name)

    def _calc_score(self):
        """Fill ``self._scores`` with one LOF decision value per row."""
        window = self._split
        total_rows, _ = self._beta_matrix.shape
        # Initial model for the rows that precede the first full window.
        self._clf.fit(self._beta_matrix[:window - 1])
        for row_idx in range(total_rows):
            if row_idx >= window:
                # Refit on the `window` rows immediately before this one.
                start = row_idx - window
                self._clf.fit(self._beta_matrix[start:row_idx])
            # NOTE(review): private scikit-learn method; confirm it exists
            # in the pinned version.
            row_score = self._clf._decision_function(
                [self._beta_matrix[row_idx]])
            self._scores[row_idx] = row_score[0]
class LocalOutlierFactorAD(ADModel):
    """Anomaly-detection model that thresholds LOF decision values.

    ``params`` holds the hyper-parameter grid as
    ``(contamination, n_neighbors, threshold)`` tuples; ``config`` selects
    one of them, ``train`` fits a scaler and the estimator, and ``predict``
    returns a 0/1 column vector.
    """

    def __init__(self):
        super().__init__()
        self.clf = None
        self.scaler = None

        # Hyper-parameter grid: neighbours outermost, thresholds innermost.
        threshold_grid = np.arange(-0.5, 0.5, 0.25)
        contamination_grid = [0.05]
        neighbor_grid = [20]
        grid = []
        for n in neighbor_grid:
            for c in contamination_grid:
                for t in threshold_grid:
                    grid.append((c, n, t))
        self.params = grid

        self.contamination = None
        self.n_neighbors = None
        self.threshold = None

    def config(self, hyperparam_tuple):
        """Unpack ``(contamination, n_neighbors, threshold)`` onto self."""
        (self.contamination,
         self.n_neighbors,
         self.threshold) = hyperparam_tuple

    def train(self, X, y, verbose = False):
        """Fit a ``StandardScaler`` and the LOF estimator on ``X``.

        ``y`` and ``verbose`` are accepted but not used by this method.
        """
        scaler = StandardScaler()
        scaler.fit(X)
        self.scaler = scaler
        self.clf = LocalOutlierFactor(contamination = self.contamination,
                                      n_neighbors = self.n_neighbors)
        self.clf.fit(self.scaler.transform(X))

    def predict(self, X, **kwargs):
        """Return an ``(n, 1)`` int32 array: 1 where the value < threshold.

        Both the raw decision values and the binary labels are printed.
        """
        scaled = self.scaler.transform(X)
        # NOTE(review): private scikit-learn method; confirm it exists in
        # the pinned version.
        raw = self.clf._decision_function(scaled)
        print(raw)
        labels = (raw < self.threshold).astype(np.int32).reshape(-1, 1)
        print(labels)
        return labels
def perform_outlier_detection(self, X, len_priors):
    """Return users whose combined LOF/IsolationForest score is in the top 5%.

    :param X: feature matrix; rows before ``len_priors`` are fitted and
        scored but left out of the ranking.
    :param len_priors: number of leading rows to drop from the scores.
    :returns: users from ``self.current_level_users`` whose score is at or
        above the 95th percentile, highest score first.
    """
    # Local Outlier Factor over all features.
    lof = LocalOutlierFactor(n_neighbors=20)
    lof.fit(X)
    check_is_fitted(lof, ["threshold_", "negative_outlier_factor_",
                          "n_neighbors_", "_distances_fit_X_"])
    if X is None:
        lof_scores = lof.negative_outlier_factor_
    else:
        X = check_array(X, accept_sparse='csr')
        # NOTE(review): private scikit-learn method; confirm it exists in
        # the pinned version.
        lof_scores = lof._decision_function(X)

    # Isolation forest over all features.
    forest = IsolationForest()
    forest.fit(X)
    forest_scores = forest.decision_function(X)

    combined = self.combine(lof_scores, forest_scores)
    new_scores = combined[len_priors:]
    ranked = sorted(zip(self.current_level_users, new_scores),
                    key=lambda pair: pair[1], reverse=True)
    cutoff = np.percentile(new_scores, 95)
    return [user for user, score in ranked if score >= cutoff]
def perform_outlier_detection(self, X):
    """Return users whose combined LOF/IsolationForest score is in the lowest 8%.

    :param X: feature matrix; the first ``self.len_priors`` rows are fitted
        and scored but left out of the ranking.
    :returns: users from ``self.current_level_users`` whose combined score
        is at or below the 8th percentile, in descending score order.
    """
    # LOF on all features. The original scored X twice in a row; the
    # second call only repeated the neighbour search.
    # NOTE(review): private scikit-learn method; confirm it exists in the
    # pinned version.
    clf = LocalOutlierFactor(n_neighbors=20)
    clf.fit(X)
    lof_scores = clf._decision_function(X)

    # Isolation forest on all features.
    clf = IsolationForest()
    clf.fit(X)
    forest_scores = clf.decision_function(X)

    scores = self.combine([lof_scores, forest_scores])

    new_scores = scores[self.len_priors:]
    user_scores = sorted(zip(self.current_level_users, new_scores),
                         key=lambda x: x[1], reverse=True)
    threshold = np.percentile(new_scores, 8)
    return [user for user, score in user_scores if score <= threshold]
def run_samples():
    """Fit LOF on the server latency/throughput sample and plot the result.

    Reads ``data/server_latency_throughput.mat``, prints the inlier and
    outlier counts, and shows a contour of the decision values over a
    0..35 grid with both groups scattered on top.
    """
    mat = read_matlab_data('data/server_latency_throughput.mat')
    X = mat.get('X')

    model = LocalOutlierFactor(n_neighbors=20)
    labels = model.fit_predict(X)
    outliers = np.where(labels == -1)
    inliers = np.where(labels == 1)
    print("Number of inliers= ", inliers[0].size)
    print("Number of outliers= ", outliers[0].size)

    # Evaluate the decision values on a square grid in steps of 0.5.
    axis = np.arange(0, 35.5, 0.5)
    xx, yy = np.meshgrid(axis, axis)
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    # NOTE(review): private scikit-learn method; confirm it exists in the
    # pinned version.
    Z = model._decision_function(grid_points).reshape(xx.shape)

    plt.title("Local Outlier Factor (LOF)")
    plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
    in_data = plt.scatter(X[inliers, 0], X[inliers, 1], c='white',
                          edgecolor='k', s=20, label='inliers')
    out_data = plt.scatter(X[outliers, 0], X[outliers, 1], c='red',
                           edgecolor='k', s=20, label='outliers')
    plt.legend(handles=[in_data, out_data], loc="upper left")
    plt.show()
class LOF:
    """Minimal LocalOutlierFactor wrapper exposing ``fit``/``anomaly_scores``."""

    def fit(self, X):
        """Fit a 16-neighbour LOF on at most the first 4096 rows of ``X``.

        Returns ``self`` so calls can be chained.
        """
        training_rows = X[:4096]
        estimator = LocalOutlierFactor(n_neighbors=16, n_jobs=-1)
        self._lof = estimator.fit(training_rows)
        return self

    def anomaly_scores(self, X):
        """Return the negated decision values of the fitted estimator for ``X``."""
        # NOTE(review): private scikit-learn method; confirm it exists in
        # the pinned version.
        decision = self._lof._decision_function(X)
        return -decision
def ano_detect(flow, err, stfeature, label):
    """Score two feature sets with LOF and report metrics for each.

    "FAL" uses ``stfeature`` and ``err`` concatenated column-wise; "LOF"
    uses ``flow`` as is. For each set a 100-neighbour LocalOutlierFactor is
    fitted and evaluated on the same points, and the negated decision
    values are passed to ``compute_metrics`` together with ``label``.
    """
    feature_sets = (
        ("FAL", np.concatenate([stfeature, err], axis=1)),
        ("LOF", flow),
    )
    for tag, points in feature_sets:
        detector = LocalOutlierFactor(n_neighbors=100)
        detector.fit(points)
        # NOTE(review): private scikit-learn method; confirm it exists in
        # the pinned version.
        ano_scores = - detector._decision_function(points)
        compute_metrics(ano_scores, label, tag)
def LOF_score(S):
    """Return the negated LOF decision value of every sample in ``S``.

    ``S`` is converted with ``np.array``; a default ``LocalOutlierFactor``
    is fitted on it and evaluated on the same samples.
    """
    samples = np.array(S)
    model = LocalOutlierFactor()
    model.fit(samples)
    # NOTE(review): private scikit-learn method; confirm it exists in the
    # pinned version.
    decision = model._decision_function(samples)
    return -1 * decision
def regulof(X):
    """Return the LOF inlier/outlier labels of ``X`` (50 neighbours).

    :param X: samples accepted by ``LocalOutlierFactor.fit_predict``.
    :returns: the label array produced by ``fit_predict``.

    The original also scored a 500x500 grid over [-1, 1]^2 through the
    private ``_decision_function`` and discarded the result. That work
    never reached the caller, restricted ``X`` to two features, and tied
    the function to a private scikit-learn method, so it was removed.
    """
    from sklearn.neighbors import LocalOutlierFactor

    clf = LocalOutlierFactor(n_neighbors=50)
    return clf.fit_predict(X)
def LOF(data, predict, k):
    """Add k-distance and LOF columns to ``predict`` and return it.

    A LocalOutlierFactor with ``k + 1`` neighbours is fitted on ``data``.
    ``predict`` is modified in place: 'k distances' receives the largest
    neighbour distance of each row, and 'local outlier factor' the negated
    decision value computed on every column except the last one.
    """
    model = LocalOutlierFactor(n_neighbors=k+1, algorithm='auto',
                               contamination=0.1, n_jobs=-1)
    model.fit(data)

    neighbor_dist = model.kneighbors(predict)[0]
    predict['k distances'] = neighbor_dist.max(axis=1)

    # Drop the last column (the one just added when it did not exist yet).
    features = predict.iloc[:, :-1]
    # NOTE(review): private scikit-learn method; confirm it exists in the
    # pinned version.
    predict['local outlier factor'] = -model._decision_function(features)
    return predict
class LOF(AbstractDetector):
    """Detector that stores LOF decision values of a binarized dataframe."""

    name = "LOF"
    data_type = "REAL"

    def compute_scores(self, dataframe: pd.DataFrame, classes: np.array):
        """Fit LOF on ``dataframe`` and keep the decision values in ``self.values``.

        The frame is first passed through its own
        ``_binarize_categorical_values`` method; the estimator is built from
        ``self.settings``. ``classes`` is not used here. Returns ``self``.
        """
        matrix = dataframe._binarize_categorical_values().values
        self.clf = LocalOutlierFactor(**self.settings)
        self.clf.fit(matrix)
        # NOTE(review): private scikit-learn method; confirm it exists in
        # the pinned version.
        self.values = self.clf._decision_function(matrix)
        return self
def localoutlierfactor(data, predict, k):
    """Annotate ``predict`` with k-distances and negated LOF decision values.

    The estimator uses ``k + 1`` neighbours and ``contamination=0.2`` and is
    fitted on ``data``. ``predict`` is modified in place and returned.
    """
    estimator = LocalOutlierFactor(n_neighbors=k + 1, contamination=0.2,
                                   n_jobs=-1)
    estimator.fit(data)

    # Record the k-neighbourhood distance (largest neighbour distance).
    distances, _ = estimator.kneighbors(predict)
    predict['k_distances'] = distances.max(axis=1)

    # Record the LOF value with its sign flipped; every column except the
    # last one is used as input.
    # NOTE(review): private scikit-learn method; confirm it exists in the
    # pinned version.
    inputs = predict.iloc[:, :-1]
    predict['local_outlier_factor'] = -estimator._decision_function(inputs)
    return predict
def LOF_anomaly_score(x):
    """Plot LOF anomaly scores per measurement and collect outlier indices.

    Parameters
    ----------
    x : iterable of (data, name) pairs
        ``data`` is anything ``pd.DataFrame`` accepts; ``name`` is used in
        the plot title and in the output filename ``LOF_images/LOF_<name>``.

    Returns
    -------
    list of list
        One list per pair holding the index labels of the rows whose score
        exceeds the threshold, ordered by ascending score.
    """
    # must calibrate it for all measurements
    method = 1  # score threshold separating outliers from inliers
    k = 30
    score_col = 'local outlier factor'
    outliers_list = []
    for i, j in x:
        pd_i = pd.DataFrame(i)
        clf = LocalOutlierFactor(n_neighbors=k, algorithm='auto',
                                 contamination=0.1, n_jobs=-1)
        clf.fit(pd_i)
        # Record k neighborhood distance
        pd_i['k distances'] = clf.kneighbors(pd_i)[0].max(axis=1)
        # Record LOF factor with flipped sign; the column just added is
        # excluded from the input.
        # NOTE(review): private scikit-learn method; confirm it exists in
        # the pinned version.
        pd_i[score_col] = -clf._decision_function(pd_i.iloc[:, :-1])

        # Build each mask once instead of re-evaluating the comparison for
        # every use.
        is_outlier = pd_i[score_col] > method
        is_inlier = pd_i[score_col] <= method
        outliers = pd_i[is_outlier].sort_values(by=score_col)

        # Figure
        plt.rcParams['axes.unicode_minus'] = False  # display the negative sign
        fig = plt.figure(figsize=(8, 4))
        fig.add_subplot(111)
        plt.scatter(pd_i[is_outlier].index, pd_i[is_outlier][score_col],
                    c='red', s=50, marker='.', alpha=None, label='outliers')
        plt.scatter(pd_i[is_inlier].index, pd_i[is_inlier][score_col],
                    c='black', s=50, marker='.', alpha=None, label='inliers')
        x_right = 2 + max(pd_i.index)
        plt.hlines(method, -2, x_right, linestyles='--')
        plt.xlim(-2, x_right)
        plt.title(f'LOF Local outlier detection of {j}', fontsize=13)
        plt.ylabel('Anamoly Score', fontsize=15)  # Local outlier Factors
        plt.legend()
        plt.savefig(f'LOF_images/LOF_{j}', format='png', dpi=1200)
        plt.show()
        # Release the figure; otherwise one figure per measurement stays
        # registered with pyplot on backends where show() does not destroy it.
        plt.close(fig)

        outliers_list.append(list(outliers.index))
    return outliers_list
def localoutlierfactor(data, predict, k):
    """Fit LOF on ``data`` and write two score columns into ``predict``.

    ``predict`` is modified in place and returned: 'k distances' holds the
    largest neighbour distance of each row, and 'local outlier factor' the
    negated decision value computed from all columns but the last.
    """
    from sklearn.neighbors import LocalOutlierFactor

    lof = LocalOutlierFactor(n_neighbors=k + 1, algorithm='auto',
                             contamination=0.1, n_jobs=-1)
    lof.fit(data)

    # Record the k-neighbourhood distance.
    k_dist = lof.kneighbors(predict)[0].max(axis=1)
    predict['k distances'] = k_dist

    # Record the LOF value, negated.
    # NOTE(review): private scikit-learn method; confirm it exists in the
    # pinned version.
    without_last = predict.iloc[:, :-1]
    negated = -lof._decision_function(without_last)
    predict['local outlier factor'] = negated
    return predict
def main():
    """Evaluate LOF on deep features and plot the resulting ROC curve.

    Loads two Keras models from ``model/``, extracts features for the train
    and both test sets with ``model_t``, min-max scales them with the
    training range, fits a 5-neighbour LOF on the training features and
    plots the ROC curve of the negated decision values (first test set
    labelled 0, second test set labelled 1).
    """
    print("loading model")
    model_t = keras.models.load_model('model/model_t.h5', compile=False)
    # Loaded as in the original; not used further in this function.
    model_r = keras.models.load_model('model/model_r.h5', compile=False)

    ds = DocDataset()
    x_train_snicor, _, _ = ds.load_train_data()
    x_test_snicer, x_test_boot = ds.load_test_data()

    # Predict in the original order, then flatten to one row per sample.
    train = model_t.predict(x_train_snicor)
    test_s = model_t.predict(x_test_snicer)
    test_b = model_t.predict(x_test_boot)
    train = train.reshape((len(x_train_snicor), -1))
    test_s = test_s.reshape((len(x_test_snicer), -1))
    test_b = test_b.reshape((len(x_test_boot), -1))

    # Rescale to the 0-1 range of the training features.
    ms = MinMaxScaler()
    train = ms.fit_transform(train)
    test_s = ms.transform(test_s)
    test_b = ms.transform(test_b)

    # Fit the model.
    clf = LocalOutlierFactor(n_neighbors=5)
    clf.fit(train)

    # Anomaly scores.
    # NOTE(review): private scikit-learn method; confirm it exists in the
    # pinned version.
    Z1 = -clf._decision_function(test_s)
    Z2 = -clf._decision_function(test_b)

    # Ground truth for the ROC curve: 0 = normal, 1 = anomalous.
    n_normal = len(test_s)
    y_true = np.zeros(n_normal + len(test_b))
    y_true[n_normal:] = 1

    # FPR and TPR (thresholds are ignored), then the AUC.
    fpr, tpr, _ = metrics.roc_curve(y_true, np.hstack((Z1, Z2)))
    auc = metrics.auc(fpr, tpr)

    # Plot the ROC curve.
    plt.plot(fpr, tpr, label='DeepOneClassification(AUC = %.2f)' % auc)
    plt.legend()
    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid(True)
    plt.show()
def localOutlierFactor(data, predict, k):
    """Return ``predict`` extended with k-distance and LOF score columns.

    A ``k + 1``-neighbour LocalOutlierFactor is fitted on ``data``. The
    frame ``predict`` is updated in place.
    """
    detector = LocalOutlierFactor(n_neighbors=k + 1, algorithm='auto',
                                  contamination=0.1, n_jobs=-1)
    detector.fit(data)

    # Distance to the farthest of the returned neighbours, per row.
    dist_matrix = detector.kneighbors(predict)[0]
    predict['k distances'] = dist_matrix.max(axis=1)

    # Negated decision value, computed without the last column.
    # NOTE(review): private scikit-learn method; confirm it exists in the
    # pinned version.
    raw = detector._decision_function(predict.iloc[:, :-1])
    predict['local outlier factor'] = -raw
    return predict
def perform_outlier_detection_all_combos(self, X):
    # Score every feature domain in X with LOF, dump the raw scores to a CSV
    # file, combine them via self.combine_all and return the users whose
    # combined score is at or below the 8th percentile.
    # NOTE(review): Python 2 syntax (print statement, dict.iteritems).
    # X is iterated as a mapping of domain name -> feature matrix; the keys
    # presumably match the four domains below -- confirm against the caller.
    # LOF on all features
    scores = {'temporal': {}, 'content': {}, 'user': {}, 'network': {}}
    print "Starting anomaly detection loop"
    for key, value in X.iteritems():
        # The bare string literals in this loop are disabled code kept as
        # no-op expressions.
        ''' if key == 'user': continue '''
        print key
        ''' clf = IsolationForest() clf.fit(value) scores[key]['iforest'] = clf.decision_function(value) print "Finished iforest" '''
        clf = LocalOutlierFactor(n_neighbors=20)
        clf.fit(value)
        # NOTE(review): private scikit-learn method; confirm it exists in
        # the pinned version.
        scores[key]['lof'] = clf._decision_function(value)
        ''' clf = DBOD() clf.fit(value) scores[key]['dbod'] = clf.decision_function_distance(value) #scores[key]['abod'] = ABOD(X, self.seed_user) scores[key]['abod'] = clf.decision_function_angle(value) '''
    print "Finished anomaly detection loop"
    # One CSV row per (domain, score type): a label cell followed by every
    # score, each terminated by a comma.
    with open(
            'clique_expansion/' + self.seed_user + '_unnormalized_scores.csv',
            'w') as f:
        for domain, value in scores.iteritems():
            for type_score, all_scores in value.iteritems():
                f.write(domain + ' ' + type_score + ',')
                for item in all_scores:
                    f.write(str(item) + ',')
                f.write('\n')
    combined_scores = self.combine_all(scores)
    # Drop the reference to the per-domain scores once they are combined.
    scores = None
    # Only the entries after the first self.len_priors are ranked.
    new_scores = combined_scores[self.len_priors:]
    user_scores = sorted(zip(self.current_level_users, new_scores),
                         key=lambda x: x[1], reverse=True)
    threshold = np.percentile(new_scores, 8)
    outliers = [u[0] for u in user_scores if u[1] <= threshold]
    return outliers
def ml():
    """Return LOF decision values on a 500x500 grid over [-5, 5]^2.

    The training set is synthetic: two Gaussian clusters of 1000 points
    each (centred at +2 and -2) plus 20 uniformly drawn points. NumPy's
    global random state is seeded with 42, so the result is reproducible.

    The original kept the ``fit_predict`` labels and sliced them at offset
    200, which does not match the 2000 cluster rows; neither value was
    used, so the model is now only fitted.
    """
    import numpy as np
    from sklearn.neighbors import LocalOutlierFactor

    np.random.seed(42)

    # Generate train data
    X = 0.3 * np.random.randn(1000, 2)
    # Generate some abnormal novel observations
    X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
    X = np.r_[X + 2, X - 2, X_outliers]

    # fit the model
    clf = LocalOutlierFactor(n_neighbors=20)
    clf.fit(X)

    # Evaluate the level sets of the decision function on the grid.
    size = 500
    xx, yy = np.meshgrid(np.linspace(-5, 5, size), np.linspace(-5, 5, size))
    # NOTE(review): private scikit-learn method; confirm it exists in the
    # pinned version.
    Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
    return Z.reshape(xx.shape)
def LOF_Score(trains, test_anomaly, test_normal):
    """Print summed LOF scores and return the scaled feature matrices.

    Features are taken from the module-level model ``D``, flattened to one
    row per sample and min-max scaled using the training features. A
    5-neighbour LOF fitted on the training features scores both test sets.

    :returns: ``(train, test_anomaly, test_normal)`` scaled feature arrays.
    """
    def _features(images):
        # Predict, then flatten each prediction to a single row.
        return D.predict(images).reshape((len(images), -1))

    train_a = _features(trains)        # train images
    test_a = _features(test_anomaly)   # test anomaly
    test_b = _features(test_normal)    # test normal
    print(train_a.shape, test_a.shape, test_b.shape)

    # MinMaxScaler fitted on the training features only.
    scaler = MinMaxScaler()
    train_a = scaler.fit_transform(train_a)
    test_a = scaler.transform(test_a)
    test_b = scaler.transform(test_b)

    lof = LocalOutlierFactor(n_neighbors=5)
    lof.fit(train_a)

    # Anomaly scores: negated decision values.
    # NOTE(review): private scikit-learn method; confirm it exists in the
    # pinned version.
    ano_total = sum(-lof._decision_function(test_a))
    normal_total = sum(-lof._decision_function(test_b))
    print('ano score {}, normal score {}'.format(ano_total, normal_total))
    return train_a, test_a, test_b
# Flatten the test features to one row per sample. train, test_s, test_b,
# X_test_s, X_test_b and the *_path names are defined before this fragment.
test_s = test_s.reshape((len(X_test_s), -1))
print('reshape test normal', test_s.shape)
test_b = test_b.reshape((len(X_test_b), -1))
print('reshape test abnormal', test_b.shape)
print('fit model')
# Scale every set with the min/max of the training features.
ms = MinMaxScaler()
train = ms.fit_transform(train)
test_s = ms.transform(test_s)
test_b = ms.transform(test_b)
# fit the model
clf = LocalOutlierFactor(n_neighbors=5)
# fit() result is bound to y_pred but not used below.
y_pred = clf.fit(train)
# Negated decision values, so larger means more anomalous.
# NOTE(review): private scikit-learn method; confirm it exists in the
# pinned version.
Z1 = -clf._decision_function(test_s)
Z2 = -clf._decision_function(test_b)
# ROC ground truth: test_s rows are labelled 0, test_b rows 1.
y_true = np.zeros(len(test_s) + len(test_b))
y_true[len(test_s):] = 1
path = x_test_s_path + x_test_b_path
# precision, recall, f1 = caculate_acc(y_true, np.hstack((Z1,Z2)),path)
fpr, tpr, _ = metrics.roc_curve(y_true, np.hstack((Z1, Z2)))
# AUC
auc = metrics.auc(fpr, tpr)
print('auc', auc)
class LOF(BaseDetector):
    """Local Outlier Factor detector wrapping scikit-learn's implementation.

    The Local Outlier Factor of a sample measures how much its local
    density deviates from that of its k nearest neighbours; samples with a
    substantially lower density than their neighbours are considered
    outliers. See :cite:`breunig2000lof` for details.

    Parameters
    ----------
    n_neighbors : int, optional (default=20)
        Number of neighbors used for `kneighbors` queries. If larger than
        the number of samples provided, all samples are used.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Nearest-neighbour search algorithm: 'ball_tree' uses BallTree,
        'kd_tree' uses KDTree, 'brute' uses a brute-force search and
        'auto' picks one based on the values passed to :meth:`fit`.
        Fitting on sparse input overrides this setting with brute force.

    leaf_size : int, optional (default=30)
        Leaf size passed to `BallTree` or `KDTree`. Affects the speed of
        construction and query and the memory needed to store the tree.

    metric : string or callable, default 'minkowski'
        Distance metric. Any metric from scikit-learn or
        scipy.spatial.distance can be used. With 'precomputed' the
        training input X is expected to be a distance matrix. A callable
        is invoked on each pair of rows and must return one distance
        value; this is less efficient than passing a metric name.

        Valid names from scikit-learn: ['cityblock', 'cosine',
        'euclidean', 'l1', 'l2', 'manhattan'].

        Valid names from scipy.spatial.distance: ['braycurtis',
        'canberra', 'chebyshev', 'correlation', 'dice', 'hamming',
        'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski',
        'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
        'sokalsneath', 'sqeuclidean', 'yule'].

        See http://docs.scipy.org/doc/scipy/reference/spatial.distance.html

    p : integer, optional (default = 2)
        Parameter of the Minkowski metric: p = 1 is the manhattan
        distance (l1), p = 2 the euclidean distance (l2); any other p
        uses minkowski_distance (l_p). See
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    contamination : float in (0., 0.5), optional (default=0.1)
        Proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    n_jobs : int, optional (default = 1)
        Number of parallel jobs for the neighbors search; ``-1`` uses all
        CPU cores. Affects only kneighbors and kneighbors_graph methods.

    Attributes
    ----------
    n_neighbors_ : int
        The actual number of neighbors used for `kneighbors` queries.

    decision_scores_ : numpy array of shape (n_samples,)
        Outlier scores of the training data; higher means more abnormal.
        Available once the detector is fitted.

    threshold_ : float
        Threshold derived from ``contamination``: the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. Used to generate binary outlier labels.

    labels_ : int, either 0 or 1
        Binary labels of the training data (0 = inlier, 1 = outlier),
        obtained by applying ``threshold_`` to ``decision_scores_``.
    """

    def __init__(self, n_neighbors=20, algorithm='auto', leaf_size=30,
                 metric='minkowski', p=2, metric_params=None,
                 contamination=0.1, n_jobs=1):
        super(LOF, self).__init__(contamination=contamination)
        self.n_neighbors = n_neighbors
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs

    # noinspection PyIncorrectDocstring
    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Validate X and register y (optional) with the base class.
        X = check_array(X)
        self._set_n_classes(y)

        estimator_kwargs = dict(n_neighbors=self.n_neighbors,
                                algorithm=self.algorithm,
                                leaf_size=self.leaf_size,
                                metric=self.metric,
                                p=self.p,
                                metric_params=self.metric_params,
                                contamination=self.contamination,
                                n_jobs=self.n_jobs)
        self.detector_ = LocalOutlierFactor(**estimator_kwargs)
        self.detector_.fit(X=X, y=y)

        # scikit-learn stores the negative factor; invert it so that
        # outliers receive the higher scores.
        negative_factor = self.detector_.negative_outlier_factor_
        self.decision_scores_ = invert_order(negative_factor)
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        For consistency, outliers are assigned larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only if they
            are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])

        # The private scoring method is chosen from the installed
        # scikit-learn version; its result is inverted so that outliers
        # score higher.
        # noinspection PyProtectedMember
        if _get_sklearn_version() > 19:
            raw_scores = self.detector_._score_samples(X)
        else:
            raw_scores = self.detector_._decision_function(X)
        return invert_order(raw_scores)

    @property
    def n_neighbors_(self):
        """The actual number of neighbors used for kneighbors queries.
        Decorator for scikit-learn LOF attributes.
        """
        return self.detector_.n_neighbors_
# Generate train data X = 0.3 * np.random.randn(100, 2) # Generate some abnormal novel observations X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2)) X = np.r_[X + 2, X - 2, X_outliers] # fit the model clf = LocalOutlierFactor(n_neighbors=20) y_pred = clf.fit_predict(X) y_pred_outliers = y_pred[200:] # plot the level sets of the decision function xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50)) Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.title("Local Outlier Factor (LOF)") plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r) a = plt.scatter(X[:200, 0], X[:200, 1], c='white', edgecolor='k', s=20) b = plt.scatter(X[200:, 0], X[200:, 1], c='red', edgecolor='k', s=20) plt.axis('tight') plt.xlim((-5, 5)) plt.ylim((-5, 5)) plt.legend([a, b],
class LOF(BaseDetector):
    """Wrapper around scikit-learn's Local Outlier Factor (LOF) estimator.

    LOF is an unsupervised outlier detector. A sample's anomaly score
    reflects how isolated it is relative to its neighbourhood: the
    distances to its k nearest neighbours estimate a local density, and
    samples whose density is substantially lower than that of their
    neighbours are treated as outliers.
    See :cite:`breunig2000lof` for details.

    Parameters
    ----------
    n_neighbors : int, optional (default=20)
        Number of neighbors to use for `kneighbors` queries. All samples
        are used when this exceeds the number of samples provided.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use BallTree
        - 'kd_tree' will use KDTree
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default=30)
        Leaf size passed to `BallTree` or `KDTree`; trades construction
        and query speed against the memory required to store the tree.

    metric : string or callable, default 'minkowski'
        Metric used for the distance computation; any metric from
        scikit-learn or scipy.spatial.distance can be used.

        If 'precomputed', the training input X is expected to be a
        distance matrix. A callable is called on each pair of instances
        (rows) and must return one value, the distance between them;
        this is less efficient than passing the metric name as a string.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1',
          'l2', 'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra',
          'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard',
          'kulsinski', 'mahalanobis', 'matching', 'minkowski',
          'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
          'sokalsneath', 'sqeuclidean', 'yule']

        See the documentation for scipy.spatial.distance for details:
        http://docs.scipy.org/doc/scipy/reference/spatial.distance.html

    p : integer, optional (default = 2)
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. p = 1 corresponds to
        manhattan_distance (l1) and p = 2 to euclidean_distance (l2);
        for arbitrary p, minkowski_distance (l_p) is used. See
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    contamination : float in (0., 0.5), optional (default=0.1)
        The proportion of outliers in the data set. When fitting this is
        used to define the threshold on the decision function.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search. If
        ``-1``, the number of CPU cores is used. Affects only kneighbors
        and kneighbors_graph methods.

    Attributes
    ----------
    n_neighbors_ : int
        The actual number of neighbors used for `kneighbors` queries.

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data; outliers tend to have
        higher scores. Available once the detector is fitted.

    threshold_ : float
        Based on ``contamination``: the ``n_samples * contamination``
        most abnormal samples in ``decision_scores_``. Calculated for
        generating binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data; 0 stands for inliers and
        1 for outliers/anomalies. Generated by applying ``threshold_`` on
        ``decision_scores_``.
    """

    def __init__(self, n_neighbors=20, algorithm='auto', leaf_size=30,
                 metric='minkowski', p=2, metric_params=None,
                 contamination=0.1, n_jobs=1):
        super(LOF, self).__init__(contamination=contamination)
        self.n_neighbors = n_neighbors
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs

    # noinspection PyIncorrectDocstring
    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).

        Returns
        -------
        self : object
            The fitted detector.
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        detector = LocalOutlierFactor(
            n_neighbors=self.n_neighbors,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            metric=self.metric,
            p=self.p,
            metric_params=self.metric_params,
            contamination=self.contamination,
            n_jobs=self.n_jobs,
        )
        self.detector_ = detector
        self.detector_.fit(X=X, y=y)

        # Invert the stored negative factor: outliers get higher scores.
        self.decision_scores_ = invert_order(
            self.detector_.negative_outlier_factor_)
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        Outliers are assigned larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only if they
            are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])

        # Pick the private scoring method according to the scikit-learn
        # version helper, then invert so outliers score higher.
        # noinspection PyProtectedMember
        use_score_samples = _sklearn_version_20()
        if not use_score_samples:
            return invert_order(self.detector_._decision_function(X))
        return invert_order(self.detector_._score_samples(X))

    @property
    def n_neighbors_(self):
        """The actual number of neighbors used for kneighbors queries.
        Decorator for scikit-learn LOF attributes.
        """
        return self.detector_.n_neighbors_
np.random.seed(42) # Generate train data X = 0.3 * np.random.randn(100, 2) # Generate some abnormal novel observations X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2)) X = np.r_[X + 2, X - 2, X_outliers] # fit the model clf = LocalOutlierFactor(n_neighbors=20) y_pred = clf.fit_predict(X) y_pred_outliers = y_pred[200:] # plot the level sets of the decision function xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50)) Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.title("Local Outlier Factor (LOF)") plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r) a = plt.scatter(X[:200, 0], X[:200, 1], c='white') b = plt.scatter(X[200:, 0], X[200:, 1], c='red') plt.axis('tight') plt.xlim((-5, 5)) plt.ylim((-5, 5)) plt.legend([a, b], ["normal observations", "abnormal observations"], loc="upper left") plt.show()
if i in [1, 2, 3, 8, 9, 10]: labels_test[index] = 1 labels_val = np.zeros(y_val.shape[0]) for index, i in enumerate(y_val): if i in [1, 2, 3, 8, 9, 10]: labels_val[index] = 1 best_clf = None best_auc = 0 best_params = None for n_neighbors_ in range(10,51): for contamination_ in np.arange(0.01,0.11,0.01): clf = LocalOutlierFactor(n_neighbors=n_neighbors_, contamination=contamination_) clf.fit(X_train) val_score = clf._decision_function(X_val) x, y, threshold = roc_curve(labels_val, -val_score) a = auc(x, y) print('n_neighbor:%s, contamination:%s, auc:%s'%(n_neighbors_, contamination_, a)) if a > best_auc: best_auc = a best_clf = clf best_params = (n_neighbors_, contamination_) test_score = best_clf._decision_function(X_test) x, y, _ = roc_curve(labels_test, -test_score) test_auc = auc(x, y) print('best in val data : auc:%s, n_neighbors:%s, contamination:%s'%(best_auc, best_params[0], best_params[1])) plt.plot(x, y, c='red', label='%s auc' % test_auc) plt.plot([0, 1], [0, 1], c='navy', linestyle='--')
class LOF(BaseDetector):
    """ Local outlier factor (LOF).

    Parameters
    ----------
    k : int (default=10)
        Number of nearest neighbors.

    contamination : float (default=0.1)
        Estimate of the expected percentage of anomalies in the data.

    metric : string (default=euclidean)
        Distance metric for the distance computation.

    tol : float (default=1e-8)
        Smallest score range that is still rescaled to [0, 1] in `predict`;
        below it all scores are considered equal.

    verbose : bool (default=False)
        Stored on the instance; not used by the methods of this class.

    Comments
    --------
    - This method DOES NOT EASILY extend to OUT-OF-SAMPLE setting!
    - The number of neighbors cannot be larger than the number of instances in
      the data: automatically correct if necessary.
    """

    def __init__(self,
                 k=10,
                 contamination=0.1,
                 metric='euclidean',
                 tol=1e-8,
                 verbose=False):
        super(LOF, self).__init__()

        self.k = int(k)
        self.contamination = float(contamination)
        self.metric = str(metric)
        self.tol = float(tol)
        self.verbose = bool(verbose)

    def fit_predict(self, X, y=None):
        """ Fit the model to the training set X and returns the anomaly score
            of the instances in X.

        :param X : np.array(), shape (n_samples, n_features)
            The samples to compute anomaly score w.r.t. the training samples.
        :param y : np.array(), shape (n_samples), default = None
            Labels for examples in X.

        :returns y_score : np.array(), shape (n_samples)
            Anomaly score for the examples in X.
        :returns y_pred : np.array(), shape (n_samples)
            Returns -1 for inliers and +1 for anomalies/outliers.
        """

        X, y = check_X_y(X, y)

        return self.fit(X, y).predict(X)

    def fit(self, X, y=None):
        """ Fit the model using data in X.

        :param X : np.array(), shape (n_samples, n_features)
            The samples to compute anomaly score w.r.t. the training samples.
        :param y : np.array(), shape (n_samples), default = None
            Labels for examples in X.

        :returns self : object
        """

        X, y = check_X_y(X, y)
        n, _ = X.shape
        nn = self._check_valid_number_of_neighbors(n)

        # The fitted scikit-learn estimator is kept on `self.clf` for `predict`.
        self.clf = LocalOutlierFactor(n_neighbors=nn,
                                      contamination=self.contamination,
                                      metric=self.metric)
        self.clf.fit(X)

        return self

    def predict(self, X):
        """ Compute the anomaly score + predict the label of instances in X.

        The scores are min-max scaled over the samples in X, and the label
        threshold (stored on `self.threshold`) is the scaled score found at
        rank ``int(n * (1 - contamination))`` of the sorted scores.

        :param X : np.array(), shape (n_samples, n_features)
            The samples to score.

        :returns y_score : np.array(), shape (n_samples)
            Anomaly score for the examples in X.
        :returns y_pred : np.array(), shape (n_samples)
            Returns -1 for inliers and +1 for anomalies/outliers.
        """

        X, y = check_X_y(X, None)
        n, _ = X.shape

        # predict the anomaly scores
        # NOTE(review): `_decision_function` is a private scikit-learn method
        # and may not exist in every release -- confirm against the pinned
        # scikit-learn version.
        lof_score = self.clf._decision_function(
            X) * -1  # Shifted opposite of the Local Outlier Factor of X

        # scaled y_score; a (near-)constant score vector would otherwise be
        # divided by zero and turn every score into NaN
        lowest = np.min(lof_score)
        score_range = np.max(lof_score) - lowest
        if score_range > self.tol:
            y_score = (lof_score - lowest) / score_range
        else:
            y_score = np.zeros(n, dtype=float)

        # prediction threshold + absolute predictions
        # the rank is clamped to the last element so that contamination == 0
        # does not index one past the end of the sorted scores
        rank = min(int(n * (1.0 - self.contamination)), n - 1)
        self.threshold = np.sort(y_score)[rank]
        y_pred = np.ones(n, dtype=float)
        y_pred[y_score < self.threshold] = -1

        return y_score, y_pred

    def _check_valid_number_of_neighbors(self, n_samples):
        """ Check if the number of nearest neighbors is valid and correct.

        :param n_samples : int
            Number of samples in the data.

        :returns : int
            The smaller of `n_samples` and `self.k`.
        """

        return min(n_samples, self.k)
def results_point_difficulty(data_original, settings):
    """Benchmark the enabled models on datasets of varying anomaly difficulty.

    For every dataset, the anomalies are sorted by their 'point_difficulty'
    column and `n_datasets` windows are cut out of that ordering. Each window
    is mixed with the regular data, shuffled, and evaluated with a 3-fold
    stratified cross-validation. The per-window mean ROC results of every
    enabled model are collected and handed to `save_results`.

    Input:
    * data_original: dict mapping a dataset name to a dict with the entries
      'regular' and 'anom' (data frames; the label is read from the
      second-to-last column and the last two columns are not used as features)
    * settings: dict with the entries 'results_dir' and
      'settings_point_difficulty'; the latter provides 'n_datasets',
      'anom_freq' and the 'models_train' on/off switches

    Output:
    * nothing is returned; for every enabled model a dict
      {dataset: {window fraction: (mean_fpr, mean_tpr, mean_auc)}} is passed
      to `save_results` under a time-stamped name
    """
    results_dir = settings['results_dir']
    settings = settings['settings_point_difficulty']
    n_datasets = settings['n_datasets']

    # (key in settings['models_train'], tag used in the saved result name),
    # listed in the order in which the models are trained and saved.
    model_tags = (
        ('lr', 'lr'),
        ('gbm', 'gbm'),
        ('iforest', 'iforest'),
        ('lof', 'lof'),
        ('autoencoder_unsupervised', 'ae_unsupervised'),
        ('autoencoder_supervised', 'ae_supervised'),
    )

    def enabled(model_key):
        # Looked up lazily so a missing switch fails where it is first needed.
        return settings['models_train'][model_key]

    results = {model_key: dict() for model_key, _ in model_tags}

    for dataset in data_original.keys():
        print('train on dataset: {}'.format(dataset))
        for model_key, _ in model_tags:
            results[model_key][dataset] = dict()

        data_reg = data_original[dataset]['regular']
        anom = data_original[dataset]['anom'].sort_values('point_difficulty')
        # Number of anomalies that yields the requested anomaly frequency.
        num_anom = np.round(settings['anom_freq'] * data_reg.shape[0] /
                            (1 - settings['anom_freq']))
        # Offset between the starts of two consecutive anomaly windows.
        step = np.round(anom.shape[0] / (n_datasets + 1))

        for i in range(n_datasets):
            scores = {model_key: [] for model_key, _ in model_tags}
            y_true = []

            window_start = int(i * step)
            window_stop = int(min(i * step + num_anom, anom.shape[0]))
            data_anom = anom.iloc[window_start:window_stop, :]
            shuffled = pd.concat([data_reg, data_anom]).sample(frac=1)
            data_sample = shuffled.reset_index(drop=True)
            X = data_sample.iloc[:, :-2]
            y = data_sample.iloc[:, -2]

            skf = StratifiedKFold(n_splits=3)
            for train_index, test_index in skf.split(X, y):
                X_train = X.iloc[train_index, :]
                X_test = X.iloc[test_index, :]
                y_train = y[train_index]
                y_test = y[test_index]
                # The unsupervised models only see samples labelled 0.
                X_train_unsupervised = X_train[y_train == 0]
                y_true.append(y_test)

                # Logistic Regression:
                if enabled('lr'):
                    lr = LogisticRegression()
                    lr.fit(X_train, y_train)
                    scores['lr'].append(lr.predict_proba(X_test)[:, 1])

                # GBM:
                if enabled('gbm'):
                    gbm = GradientBoostingClassifier()
                    gbm.fit(X_train, y_train)
                    scores['gbm'].append(gbm.predict_proba(X_test)[:, 1])

                # Isolation Forest:
                if enabled('iforest'):
                    iforest = IsolationForest()
                    iforest.fit(X_train_unsupervised)
                    raw = iforest.decision_function(X_test)
                    # Rescale to [0, 1] and flip, so larger means more anomalous.
                    bounds = (raw.min(), raw.max())
                    scores['iforest'].append(
                        1 - np.interp(raw, bounds, (0, 1)))

                # Local Outlier Factor (LOF):
                if enabled('lof'):
                    lof = LocalOutlierFactor()
                    lof.fit(X_train_unsupervised)
                    raw = lof._decision_function(X_test)
                    bounds = (raw.min(), raw.max())
                    scores['lof'].append(1 - np.interp(raw, bounds, (0, 1)))

                # Autoencoder unsupervised
                if enabled('autoencoder_unsupervised'):
                    input_dim = X_train_unsupervised.shape[1]
                    ae = autoencoder.autoencoder_unsupervised(
                        input_dim=input_dim)
                    ae.fit(X_train_unsupervised,
                           X_train_unsupervised,
                           batch_size=50,
                           epochs=2,
                           verbose=0)
                    X_test_pred = ae.predict(X_test)
                    scores['autoencoder_unsupervised'].append(
                        autoencoder.reconstruction_error(X_test, X_test_pred))

                # Autoencoder supervised
                if enabled('autoencoder_supervised'):
                    input_dim = X_train.shape[1]
                    ae = autoencoder.autoencoder_supervised(
                        input_dim=input_dim)
                    # The target is the feature frame with the label appended.
                    ae_target = pd.concat([X_train, y_train], axis=1)
                    ae.fit(X_train,
                           ae_target,
                           batch_size=50,
                           epochs=2,
                           verbose=0)
                    X_test_pred = ae.predict(X_test)
                    scores['autoencoder_supervised'].append(
                        autoencoder.reconstruction_error(X_test, X_test_pred))

            # Aggregate the folds of this window for every enabled model.
            for model_key, _ in model_tags:
                if enabled(model_key):
                    mean_fpr, mean_tpr, mean_auc = create_mean_roc_auc(
                        y_true, scores[model_key])
                    results[model_key][dataset][np.round(i / n_datasets, 2)] = \
                        (mean_fpr, mean_tpr, mean_auc)

    timestr = time.strftime("%H%M%S")
    for model_key, tag in model_tags:
        if enabled(model_key):
            name = 'results_point_difficulty_{}_{}'.format(tag, timestr)
            save_results(results[model_key], results_dir, name)
def plot(X, y_pred, clf, indexes=None):
    """Plot a 2-D Local Outlier Factor view of X and save it to disk.

    X is scaled with `MaxAbsScaler`, projected onto two PCA components, and a
    new `LocalOutlierFactor(n_neighbors=20)` is fitted on that projection. The
    level sets of its decision function are drawn on the square [-1, 1]^2,
    overlaid with either the inliers/outliers of the new model or, when
    `indexes` is given, the samples selected by each index vector. The figure
    is written to "visualization1.png" and then shown.

    :param X: array-like, shape (n_samples, n_features); the samples to plot.
    :param y_pred: ignored; the predictions are recomputed on the 2-D
        projection. Kept for interface compatibility.
    :param clf: ignored; a new model is fitted on the 2-D projection. Kept
        for interface compatibility.
    :param indexes: optional sequence of index vectors, each of length
        n_samples; entries equal to 1 select the samples to call out. At most
        three vectors are drawn (green, purple, orange).
    """
    # `None` replaces the former mutable `[]` default; behaviour is unchanged.
    indexes = [] if indexes is None else indexes
    print("indexes: %s" % (indexes,))

    pca = PCA(n_components=2)
    scaler = MaxAbsScaler()
    X_scaled = scaler.fit_transform(X)
    X = pca.fit_transform(X_scaled)

    # Kept from the original: reseeds NumPy's global random generator.
    np.random.seed(42)

    # fit the model
    clf = LocalOutlierFactor(n_neighbors=20)
    # re-fitting a new model on the 2d data
    y_pred = clf.fit_predict(X)
    assert (len(X) == len(y_pred))

    # Boolean masks instead of a shared `zip` object: on Python 3 `zip` is a
    # one-shot iterator, so a second pass over it used to yield no outliers.
    inliers = X[y_pred == 1]
    outliers = X[y_pred == -1]
    assert (len(y_pred) == len(inliers) + len(outliers))

    # Kept as a plain list: the groups usually differ in size and cannot be
    # stacked into one rectangular array.
    call_outs = []
    if len(indexes) > 0:
        assert (all([len(i) == len(X) for i in indexes]))
        for index in indexes:
            call_outs.append(X[np.asarray(index) == 1])
        print("call_outs:")
        print(call_outs)

    # plot the level sets of the decision function
    xx, yy = np.meshgrid(np.linspace(-1, 1, 50), np.linspace(-1, 1, 50))
    # NOTE(review): `_decision_function` is a private scikit-learn method and
    # may not exist in every release -- confirm against the pinned version.
    Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    print("*" * 80)
    print("Z:")
    print(Z.shape)
    print(Z)
    print("*" * 80)

    plt.title("Local Outlier Factor (LOF) for AIS-Scenario16")
    plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

    if len(call_outs) > 0:
        legend = []
        for call_out, color in zip(call_outs, ["green", "purple", "orange"]):
            c = plt.scatter(call_out[:, 0],
                            call_out[:, 1],
                            c=color,
                            marker=".",
                            alpha=0.1,
                            s=20)
            legend.append(c)
        plt.legend(legend,
                   ["removes r = x/ y", "removes long switch statement"],
                   loc="upper left")
    else:
        a = plt.scatter(inliers[:, 0],
                        inliers[:, 1],
                        c='white',
                        edgecolor='k',
                        s=20)
        b = plt.scatter(outliers[:, 0],
                        outliers[:, 1],
                        c='red',
                        edgecolor='k',
                        s=20)
        plt.legend([a, b], ["typical behavior", "outliers"], loc="upper left")

    plt.axis('tight')
    plt.autoscale()
    plt.savefig("visualization1.png")
    plt.show()
def _plot_ranked_images(images, scores, out_path, top_k, largest):
    """Draw the `top_k` images with the most extreme scores in one row.

    :param images: indexable collection of images accepted by `plt.imshow`.
    :param scores: 1-D array with one score per image.
    :param out_path: file the figure is saved to.
    :param top_k: number of subplot columns and maximum number of images.
    :param largest: True to pick the highest scores (descending order),
        False to pick the lowest scores (ascending order).
    """
    keys = -scores if largest else scores
    count_available = len(keys)
    if count_available > top_k:
        candidates = np.argpartition(keys, top_k)[:top_k]
    else:
        # `argpartition` needs kth < n; with this few samples take them all.
        candidates = np.arange(count_available)
    ranked = candidates[np.argsort(keys[candidates])]

    plt.figure()
    for count, i in enumerate(ranked):
        plt.subplot(1, top_k, count + 1)
        plt.imshow(images[i])
        plt.title(f"index: {i}\n{scores[i]:.3e}")
        plt.tick_params(labelbottom=False,
                        labelleft=False,
                        labelright=False,
                        labeltop=False)
        plt.tick_params(bottom=False, left=False, right=False, top=False)
    # Save before showing: after `show()` returns the figure may already be
    # released, in which case `savefig` would write an empty image.
    plt.savefig(out_path)
    plt.show()


def predict(x_train_s, x_test_s, x_test_b, model) -> None:
    """Score two test sets with LOF on model features and plot the results.

    The outputs of `model.predict` are flattened per sample, min-max scaled
    with a scaler fitted on the training outputs, and scored with a
    `LocalOutlierFactor(n_neighbors=5)` fitted on the training outputs. The
    anomaly score is the negated decision function, so larger means more
    anomalous.

    Five figures are saved under "_data/" and shown: the TOP_K highest and
    lowest scoring images of each test set, and the ROC curve obtained when
    `x_test_s` is labelled 0 (normal) and `x_test_b` is labelled 1 (abnormal).

    :param x_train_s: training samples; passed to `model.predict`.
    :param x_test_s: test samples treated as normal; must be indexable and
        each element must be displayable by `plt.imshow`.
    :param x_test_b: test samples treated as abnormal; same requirements.
    :param model: object whose `predict` returns one output per sample
        (presumably a Keras-style model -- confirm against the caller).
    """
    train = model.predict(x_train_s)
    test_s = model.predict(x_test_s)
    test_b = model.predict(x_test_b)

    # One flat feature vector per sample.
    train = train.reshape((len(x_train_s), -1))
    test_s = test_s.reshape((len(x_test_s), -1))
    test_b = test_b.reshape((len(x_test_b), -1))

    ms = MinMaxScaler()
    train = ms.fit_transform(train)
    test_s = ms.transform(test_s)
    test_b = ms.transform(test_b)

    clf = LocalOutlierFactor(n_neighbors=5)
    clf.fit(train)

    # NOTE(review): `_decision_function` is a private scikit-learn method and
    # may not exist in every release -- confirm against the pinned version.
    z1 = -clf._decision_function(test_s)
    z2 = -clf._decision_function(test_b)

    TOP_K = 5

    _plot_ranked_images(x_test_s, z1, "_data/x_test_s_top_k.png", TOP_K, True)
    _plot_ranked_images(x_test_b, z2, "_data/x_test_b_top_k.png", TOP_K, True)
    _plot_ranked_images(x_test_s, z1, "_data/x_test_s_under_k.png", TOP_K,
                        False)
    _plot_ranked_images(x_test_b, z2, "_data/x_test_b_under_k.png", TOP_K,
                        False)

    y_true = np.zeros(len(test_s) + len(test_b))
    y_true[len(test_s):] = 1  # normal = 0, abnormal = 1

    fpr, tpr, _ = metrics.roc_curve(y_true, np.hstack((z1, z2)))
    auc = metrics.auc(fpr, tpr)

    # Own figure, so the curve is never drawn into the last image subplot.
    plt.figure()
    plt.plot(fpr, tpr, label=f"DOC(AUC = {auc})")
    plt.legend()
    plt.title("ROC curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.grid(True)
    plt.savefig("_data/roc_curve.png")
    plt.show()
def main(camera_FPS, camera_width, camera_height, inference_scale, threshold,
         device):
    """Run live anomaly detection on camera 0 and display the result.

    A `LocalOutlierFactor(n_neighbors=5)` is fitted on the min-max scaled
    rows of "train.csv" from the model folder, and the network is loaded from
    the "irmodels/tensorflow" IR files (FP16 for device "MYRIAD", FP32
    otherwise). If the model folder does not exist the process exits.

    Keys in the preview window:
    * p -- save the current raw frame to "pictures/<n>.jpg"
    * s -- start scoring frames
    * q -- quit

    While scoring, each frame is resized, centre-cropped to
    `inference_scale` x `inference_scale`, passed through the network, and
    scored with the negated LOF decision function. The mean of a 10-slot
    ring buffer of scores is drawn in red when it exceeds `threshold`,
    otherwise in green. The buffer starts filled with zeros, so the first
    ten means include those zeros.

    :param camera_FPS: frame rate requested from the camera.
    :param camera_width: frame width requested from the camera.
    :param camera_height: frame height requested from the camera.
    :param inference_scale: side length of the square network input.
    :param threshold: mean score above which the frame is flagged in red.
    :param device: inference device name, e.g. "CPU" or "MYRIAD".
    """
    path = "pictures/"
    if not os.path.exists(path):
        os.mkdir(path)

    model_path = "OneClassAnomalyDetection-RaspberryPi3/DOC/model/"
    if os.path.exists(model_path):
        # LOF
        print("LOF model building...")
        x_train = np.loadtxt(model_path + "train.csv", delimiter=",")

        ms = MinMaxScaler()
        x_train = ms.fit_transform(x_train)

        # fit the LOF model
        clf = LocalOutlierFactor(n_neighbors=5)
        clf.fit(x_train)

        # DOC
        print("DOC Model loading...")
        if device == "MYRIAD":
            model_xml = "irmodels/tensorflow/FP16/weights.xml"
            model_bin = "irmodels/tensorflow/FP16/weights.bin"
        else:
            model_xml = "irmodels/tensorflow/FP32/weights.xml"
            model_bin = "irmodels/tensorflow/FP32/weights.bin"
        net = IENetwork(model=model_xml, weights=model_bin)
        plugin = IEPlugin(device=device)
        if device == "CPU" and platform.processor() == "x86_64":
            plugin.add_cpu_extension("lib/x86_64/libcpu_extension.so")
        exec_net = plugin.load(network=net)
        input_blob = next(iter(net.inputs))
        print("loading finish")
    else:
        print("Nothing model folder")
        sys.exit(0)

    # Resize so the shorter side equals `inference_scale`, then crop the
    # longer side symmetrically to obtain a square input.
    base_range = min(camera_width, camera_height)
    stretch_ratio = inference_scale / base_range
    resize_image_width = int(camera_width * stretch_ratio)
    resize_image_height = int(camera_height * stretch_ratio)

    if base_range == camera_height:
        crop_start_x = (resize_image_width - inference_scale) // 2
        crop_start_y = 0
    else:
        crop_start_x = 0
        crop_start_y = (resize_image_height - inference_scale) // 2
    crop_end_x = crop_start_x + inference_scale
    crop_end_y = crop_start_y + inference_scale

    fps = ""
    message = "Push [p] to take a picture"
    result = "Push [s] to start anomaly detection"
    flag_score = False
    picture_num = 1
    score_mean = np.zeros(10)  # ring buffer of the most recent scores
    mean_NO = 0  # next slot of the ring buffer to overwrite

    cap = cv2.VideoCapture(0)
    try:
        cap.set(cv2.CAP_PROP_FPS, camera_FPS)
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, camera_width)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, camera_height)

        time.sleep(1)

        while cap.isOpened():
            t1 = time.time()

            ret, image = cap.read()
            if not ret:
                break
            # Untouched copy for [p]; `image` gets text drawn onto it.
            image_copy = image.copy()

            if flag_score:
                # prediction
                prepimg = cv2.resize(image,
                                     (resize_image_width, resize_image_height))
                prepimg = prepimg[crop_start_y:crop_end_y,
                                  crop_start_x:crop_end_x]
                prepimg = np.array(prepimg).reshape(
                    (1, inference_scale, inference_scale, 3))
                prepimg = prepimg / 255
                # NHWC -> NCHW
                prepimg = prepimg.transpose((0, 3, 1, 2))

                exec_net.start_async(request_id=0,
                                     inputs={input_blob: prepimg})
                exec_net.requests[0].wait(-1)
                outputs = exec_net.requests[0].outputs["Reshape_"]
                outputs = outputs.reshape((len(outputs), -1))
                outputs = ms.transform(outputs)
                # NOTE(review): `_decision_function` is a private scikit-learn
                # method and may not exist in every release -- confirm against
                # the pinned version.
                score = -clf._decision_function(outputs)

                # output score
                score_mean[mean_NO] = score[0]
                mean_NO = (mean_NO + 1) % len(score_mean)
                mean_score = np.mean(score_mean)

                if mean_score > threshold:
                    # red if score is big (OpenCV colours are BGR)
                    color = (0, 0, 255)
                else:
                    # green if score is small
                    color = (0, 255, 0)
                cv2.putText(image, "{:.1f} Score".format(mean_score),
                            (camera_width - 230, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, color, 1,
                            cv2.LINE_AA)
            else:
                cv2.putText(image, result, (camera_width - 350, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1,
                            cv2.LINE_AA)

            # message
            cv2.putText(image, message, (camera_width - 285, 15),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1,
                        cv2.LINE_AA)
            cv2.putText(image, fps, (camera_width - 164, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 1,
                        cv2.LINE_AA)

            cv2.imshow("Result", image)

            # FPS; keep the previous label if the clock did not advance
            elapsedTime = time.time() - t1
            if elapsedTime > 0:
                fps = "{:.0f} FPS".format(1 / elapsedTime)

            # quit or calculate score or take a picture
            key = cv2.waitKey(1) & 0xFF
            if key == ord("q"):
                break
            if key == ord("p"):
                cv2.imwrite(path + str(picture_num) + ".jpg", image_copy)
                picture_num += 1
            if key == ord("s"):
                flag_score = True
    finally:
        # Always hand the camera back, even if the loop raised.
        cap.release()
        cv2.destroyAllWindows()
def analyze(data):
    """Detect outlier hosts by (IP address, mean response size) with LOF.

    `data` is parsed as JSON and the 'SIZE' values are averaged per 'HOST'.
    Every host becomes one 2-D sample: its dotted-quad address with each of
    the four octets zero-padded to three digits, concatenated, read as a
    number and divided by 1000; and its mean response size. A
    `LocalOutlierFactor(n_neighbors=20)` is fitted on the samples and a plot
    of its decision function with the inliers and outliers is saved to
    'test-save-outlier-LOF.png'.

    :param data: JSON text holding a list of records, each with the keys
        'HOST' (expected to be a dotted-quad IPv4 address) and 'SIZE'.
    :raises IndexError: if a host has fewer than four dot-separated parts.
    :raises ValueError: if a host part or a size is not numeric.
    """
    # Convert this to python data for us to be able to run ML algorithms
    records = json.loads(data)

    # Data pre-processing here:
    per_size = dict()  # host -> list of response sizes, in first-seen order
    for record in records:
        per_size.setdefault(record['HOST'], []).append(int(record['SIZE']))

    log.debug(
        "*** Printing Input to analysis - 4 (1): K-means on IP and average response size ****"
    )

    #####*****SIZE******####
    #### Analysis #4 (1): IP address - Size of response received feature
    # Rows are collected in a list and converted once. The former all-zero
    # seed row used for stacking is gone, so it is no longer fed to the model
    # as if it were a real host.
    rows = []
    for host, sizes in per_size.items():
        avg_size = mean(sizes)
        log.debug(host + ": " + str(avg_size))
        octets = host.split(".")
        # Left-pad each of the four octets to three digits and concatenate.
        ip = "".join(octets[z].rjust(3, "0") for z in range(4))
        rows.append([float(ip) / 1000, avg_size])

    if not rows:
        # Nothing to fit or plot.
        log.info("No hosts found in the input; skipping outlier analysis.")
        return
    X = np.array(rows, dtype=float)

    log.info(
        "******** Analysis #4 (3) : IP-Address and Response Size received: LocalOutlierFactor ********"
    )
    log.info(
        "******** Please check the image test-save-outlier-LOF.png saved in your working directory for more info. ********"
    )

    ######################################################
    ##Analysis : 4 (3): Outlier detection:
    # Kept from the original: reseeds NumPy's global random generator.
    np.random.seed(42)

    # fit the model
    clf = LocalOutlierFactor(n_neighbors=20)
    y_pred = clf.fit_predict(X)
    # Split by the model's own verdict (1 = inlier, -1 = outlier) rather than
    # by row position.
    normal = X[y_pred == 1]
    abnormal = X[y_pred == -1]

    # plot the level sets of the decision function over the data's extent;
    # a 5% margin (or 1.0 for a degenerate axis) keeps border points visible
    x_min, y_min = X.min(axis=0)
    x_max, y_max = X.max(axis=0)
    x_pad = 0.05 * (x_max - x_min) or 1.0
    y_pad = 0.05 * (y_max - y_min) or 1.0
    xx, yy = np.meshgrid(np.linspace(x_min - x_pad, x_max + x_pad, 50),
                         np.linspace(y_min - y_pad, y_max + y_pad, 50))
    # NOTE(review): `_decision_function` is a private scikit-learn method and
    # may not exist in every release -- confirm against the pinned version.
    Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.title("Local Outlier Factor (LOF)")
    plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

    a = plt.scatter(normal[:, 0], normal[:, 1], c='white', edgecolor='k', s=20)
    b = plt.scatter(abnormal[:, 0],
                    abnormal[:, 1],
                    c='red',
                    edgecolor='k',
                    s=20)
    plt.axis('tight')
    plt.legend([a, b], ["normal observations", "abnormal observations"],
               loc="upper left")
    plt.savefig('test-save-outlier-LOF.png')