def test_score_samples():
    """Check ``score_samples`` against ``decision_function`` and ``offset_``.

    For a forest fitted with ``contamination=0.1`` and one fitted with the
    default settings, ``score_samples`` must equal ``decision_function``
    plus the fitted ``offset_``. The ``score_samples`` output of the two
    forests must also be equal.
    """
    train = [[1, 1], [1, 2], [2, 1]]
    query = [[2., 2.]]
    clf1 = IsolationForest(contamination=0.1).fit(train)
    clf2 = IsolationForest().fit(train)

    # Same identity is asserted for each forest in turn.
    for forest in (clf1, clf2):
        assert_array_equal(forest.score_samples(query),
                           forest.decision_function(query) + forest.offset_)

    assert_array_equal(clf1.score_samples(query),
                       clf2.score_samples(query))
def test_iforest_works(contamination):
    """Check that IsolationForest singles out the two outliers of a toy set.

    Parameters
    ----------
    contamination : value forwarded to ``IsolationForest(contamination=...)``.
    """
    # Toy sample: the last two rows are the outliers.
    samples = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
               [6, 3], [-4, 7]]
    n_outliers = 2
    n_inliers = len(samples) - n_outliers

    forest = IsolationForest(random_state=rng, contamination=contamination)
    forest.fit(samples)

    # Negated so that the outliers are expected to carry the largest values.
    anomaly = -forest.decision_function(samples)
    labels = forest.predict(samples)

    # Every outlier must score strictly above every inlier ...
    assert_greater(np.min(anomaly[-n_outliers:]),
                   np.max(anomaly[:-n_outliers]))
    # ... and the predicted labels must be +1 for inliers, -1 for outliers.
    assert_array_equal(labels, n_inliers * [1] + n_outliers * [-1])
def outlier_removal(df, col, method, params):
    """Fit an outlier detector on ``df[col]`` and store its output in ``df``.

    Parameters
    ----------
    df : DataFrame that is read via ``df[col]`` and extended in place with
        new ``('meta', ...)`` columns.
    col : column selector passed to ``df[col]``.
    method : ``'Isolation Forest'`` or ``'Local Outlier Factor'``.
    params : dict of keyword arguments forwarded to the detector constructor.
        Its ``str()`` is also embedded in the names of the added columns.

    Returns
    -------
    (df, detector) : the mutated frame and the fitted detector.

    Raises
    ------
    ValueError
        If ``method`` is not one of the two supported names. Previously this
        case fell through and failed later with an unbound local variable.
    """
    if method == 'Isolation Forest':
        do_outlier_removal = IsolationForest(**params)
    elif method == 'Local Outlier Factor':
        do_outlier_removal = LocalOutlierFactor(**params)
    else:
        raise ValueError(
            "Unsupported outlier removal method: %r (expected "
            "'Isolation Forest' or 'Local Outlier Factor')" % (method,))

    # Snapshot the input once, before any column is added to ``df``, so every
    # detector call sees exactly the same feature matrix.
    data = np.array(df[col])
    label = method + str(params)

    if method == 'Isolation Forest':
        do_outlier_removal.fit(data)
        outlier_scores = do_outlier_removal.decision_function(data)
        df[('meta', 'Outlier Scores - ' + label)] = outlier_scores
        is_outlier = do_outlier_removal.predict(data)
        df[('meta', 'Outliers - ' + label)] = is_outlier
    else:
        # fit_predict fits the model itself; a separate fit() beforehand
        # would only repeat the same work.
        is_outlier = do_outlier_removal.fit_predict(data)
        df[('meta', 'Outliers - ' + label)] = is_outlier
        df[('meta', 'Outlier Factor - ' + label)] = \
            do_outlier_removal.negative_outlier_factor_

    return df, do_outlier_removal
def test_iforest_performance():
    """Test Isolation Forest performs well"""

    # Generate train/test data: the first 100 rows train the model, the
    # remaining 20 are held out as regular test observations.
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    # 0 = held-out regular rows, 1 = uniformly drawn abnormal rows
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = - clf.decision_function(X_test)

    # check that the scores rank the abnormal rows above the regular ones
    # (ROC AUC strictly greater than 0.98)
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
# Generate some regular novel observations X = 0.3 * rng.randn(20, 2) X_test = np.r_[X + 2, X - 2] # Generate some abnormal novel observations X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)) # fit the model clf = IsolationForest(max_samples=100, random_state=rng) clf.fit(X_train) y_pred_train = clf.predict(X_train) y_pred_test = clf.predict(X_test) y_pred_outliers = clf.predict(X_outliers) # plot the line, the samples, and the nearest vectors to the plane xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50)) Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.title("IsolationForest") plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r) b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white') b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green') c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red') plt.axis('tight') plt.xlim((-5, 5)) plt.ylim((-5, 5)) plt.legend([b1, b2, c], ["training observations", "new regular observations", "new abnormal observations"], loc="upper left")
X_test = np.r_[X + 2, X - 2] ##按行堆叠,shape(40,2) # Generate some abnormal novel observations X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)) ##shape(20,2) # fit the model clf = IsolationForest(max_samples=100, random_state=rng) clf.fit( X_train ) ## 训练出一个iForest,iForest为无监督的方法,但是也不能直接对无标记样本集预测,可以先fit无标记样本集,然后在predict y_pred_train = clf.predict(X_train) y_pred_test = clf.predict(X_test) y_pred_outliers = clf.predict(X_outliers) # plot the line, the samples, and the nearest vectors to the plane xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50)) Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) ##按列堆叠shape(100,2),并且得出决策边界 Z = Z.reshape(xx.shape) plt.title("IsolationForest") plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r) ##画出决策边界,不同的区域颜色不同 b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k') b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k') c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red', s=20, edgecolor='k') plt.axis('tight') plt.xlim((-5, 5)) plt.ylim((-5, 5))
def Eval(clargs):
    """Train an isolation forest on HTTP logs and report a partial ROC AUC.

    Parses ``clargs`` with ``OptionParser``; exactly two positional
    arguments are required. ``args[0]`` is loaded as the class-0 evaluation
    data and ``args[1]`` as the training data. Class-1 evaluation rows come
    from ``--maliciousdatafile`` when given, otherwise from
    ``create_noise_contrast``.

    Side effects: prints progress, writes ``fig3.png`` and appends one
    record to ``Results.txt``.

    NOTE(review): ``fields_to_use``, ``all_fields`` and ``numSamples`` are
    not defined in this function and must exist at module level. The files
    are loaded with ``fields_to_use`` while vectorisation uses the locally
    built ``ftu``; confirm that this is intended.

    Parameters
    ----------
    clargs : list of command-line argument strings.

    Returns
    -------
    roc_auc : area under the interpolated ROC curve restricted to false
        positive rates between 0.001 and 0.01.
    """
    __version__ = '1.0'
    usage = """train_flows [options] normaldatafile"""
    parser = OptionParser(usage=usage, version=__version__)
    parser.add_option("-x", "--vectorizerfile", action="store", type="string",
                      default='/tmp/vectorizers.pkl', help="")
    parser.add_option("-v", "--verbose", action="store_true", default=False,
                      help="enable verbose output")
    parser.add_option("-o", "--maliciousdatafile", action="store", type="string",
                      default=None, help="An optional file of malicious http logs")
    parser.add_option("-m", "--maxfeaturesperbag", action="store", type="int",
                      default=100, help="maximum number of features per bag")
    parser.add_option("-g", "--ngramsize", action="store", type="int",
                      default=7, help="ngram size")
    parser.add_option("-f", "--features", action="store", type="string",
                      default="01000100111111111111", help="An optional file for choosing which features to be extracted")
    parser.add_option("-t", "--maxtrainingfeatures", action="store", type="int",
                      default=50000, help="maximum number of rows to train with per class")
    parser.add_option("-n", "--numtrees", action="store", type="int",
                      default=200, help="number of trees in isolation forest")
    parser.add_option("-s", "--numsamples", action="store", type="int",
                      default=8192, help="number of samples in each tree")

    Start = time.time()
    (opts, args) = parser.parse_args(clargs)

    if len(args) != 2:
        parser.error('Incorrect number of arguments')

    # Position i of the --features string selects all_fields[i] when it is '1'.
    ftu = []
    features = opts.features
    for i, j in enumerate(features):
        if opts.verbose:
            print(j, all_fields[i])
        if j == 1 or j == '1':
            ftu.append(all_fields[i])
    if opts.verbose:
        # Parenthesised so the statement is valid on Python 2 and Python 3.
        print(ftu)

    # load the http data in to a data frame
    print('Loading HTTP data')
    df = load_brofile(args[0], fields_to_use)
    trainDf = load_brofile(args[1], fields_to_use)

    total_rows = len(df.index)
    if opts.verbose:
        print('Total number of rows: %d' % total_rows)

    if opts.maliciousdatafile is not None:
        print('Reading malicious training data')
        df1 = load_brofile(opts.maliciousdatafile, fields_to_use)
        if opts.verbose:
            print('Read malicious data with %s rows ' % len(df1.index))

        # set the classes of the dataframes and then stitch them together in
        # to one big dataframe
        df['class'] = 0
        df1['class'] = 1
        classedDf = pd.concat([df, df1], ignore_index=True)
    else:
        # we weren't passed a file containing class-1 data, so we should
        # generate some of our own.
        noiseDf = create_noise_contrast(df, numSamples)
        if opts.verbose:
            print('Added %s rows of generated malicious data' % numSamples)
        df['class'] = 0
        noiseDf['class'] = 1
        classedDf = pd.concat([df, noiseDf], ignore_index=True)

    # The class of the training rows is not evaluated; it is set only so the
    # concatenated frame has a value in every row.
    trainDf['class'] = 0

    # splitting into training and evaluation sets
    classedDf['is_train'] = False
    trainDf['is_train'] = True
    enhancedDf = enhance_flow(pd.concat([trainDf, classedDf], ignore_index=True), ftu)

    # construct some vectorizers based on the data in the DF. We need to
    # vectorize future log files the exact same way so we will be saving
    # these vectorizers to a file.
    vectorizers = build_vectorizers(enhancedDf, ftu, max_features=opts.maxfeaturesperbag,
                                    ngram_size=opts.ngramsize, verbose=opts.verbose)

    # use the vectorizers to featureize our DF into a numeric feature dataframe
    featureMatrix = featureize(enhancedDf, ftu, vectorizers, verbose=opts.verbose)

    # add the class column back in (it wasn't featurized by itself)
    featureMatrix['class'] = enhancedDf['class']
    featureMatrix['is_train'] = enhancedDf['is_train']

    # split out the train and test df's into separate objects
    train = featureMatrix[featureMatrix['is_train'] == True]
    test = featureMatrix[featureMatrix['is_train'] == False]

    # drop the is_train column, we don't need it anymore
    train = train.drop('is_train', axis=1)
    test = test.drop('is_train', axis=1)

    Trees = opts.numtrees
    Samples = opts.numsamples
    clf = IsolationForest(n_estimators=Trees, max_samples=Samples)
    clf.fit(train.drop('class', axis=1))

    testnoclass = test.drop('class', axis=1)

    print('Predicting')
    test.is_copy = False
    test['prediction'] = clf.decision_function(testnoclass) + 0.5

    print('Analyzing')
    # Partial AUC: interpolate the ROC curve and integrate it over a
    # log-spaced grid of false positive rates in [Left, Right].
    Left = 0.001
    Right = 0.01

    fpr, tpr, thresholds = roc_curve(test['class'], test['prediction'], pos_label=0)
    F = interpolate.interp1d(fpr, tpr, assume_sorted=True)
    x = np.logspace(np.log10(Left), np.log10(Right))
    y = F(x)
    roc_auc = auc(x, y)

    plt.figure()
    plt.xscale('log')
    plt.plot(fpr, tpr, color='b')
    plt.plot(x, y, color='r')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.plot(plt.xlim(), plt.ylim(), ls="--", c=".3")
    plt.savefig("fig3.png")
    plt.clf()
    plt.close('all')

    print('Area Under the Curve = %.6f' % (roc_auc))
    Min, Sec = divmod(int(time.time() - Start), 60)

    # Append "<trees> <samples> <min> <sec> <auc>\n<features>\n\n"; the
    # context manager closes the file even if a write fails.
    with open('Results.txt', 'a') as target:
        target.write(str(Trees) + ' ')
        target.write(str(Samples) + ' ')
        target.write(str(Min) + ' ')
        target.write(str(Sec) + ' ')
        target.write(str(roc_auc))
        target.write("\n")
        target.write(str(features))
        target.write("\n")
        target.write("\n")

    print("Minutes: %d, Seconds: %d" % (int(Min), int(Sec)))

    return roc_auc
class IForest(BaseDetector):
    r"""Wrapper of scikit-learn Isolation Forest with more functionalities.
    The IsolationForest 'isolates' observations by randomly selecting a
    feature and then randomly selecting a split value between the maximum and
    minimum values of the selected feature.
    See :cite:`liu2008isolation,liu2012isolation` for details.

    Since recursive partitioning can be represented by a tree structure, the
    number of splittings required to isolate a sample is equivalent to the path
    length from the root node to the terminating node.

    This path length, averaged over a forest of such random trees, is a
    measure of normality and our decision function.

    Random partitioning produces noticeably shorter paths for anomalies.
    Hence, when a forest of random trees collectively produce shorter path
    lengths for particular samples, they are highly likely to be anomalies.

    :param n_estimators: The number of base estimators in the ensemble.
    :type n_estimators: int, optional (default=100)

    :param max_samples: The number of samples to draw from X to train each
        base estimator.

            - If int, then draw `max_samples` samples.
            - If float, then draw `max_samples * X.shape[0]` samples.
            - If "auto", then `max_samples=min(256, n_samples)`.
    :type max_samples: int or float, optional (default="auto")

    :param contamination: The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.
    :type contamination: float in (0., 0.5), optional (default=0.1)

    :param max_features: The number of features to draw from X to train
        each base estimator.

            - If int, then draw `max_features` features.
            - If float, then draw `max_features * X.shape[1]` features.
    :type max_features: int or float, optional (default=1.0)

    :param bootstrap: If True, individual trees are fit on random subsets of
        the training data sampled with replacement. If False, sampling without
        replacement is performed.
    :type bootstrap: bool, optional (default=False)

    :param n_jobs: The number of jobs to run in parallel for both `fit` and
        `predict`. If -1, then the number of jobs is set to the number of cores
    :type n_jobs: int, optional (default=1)

    :param random_state: If int, random_state is the seed used by the random
        number generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by `np.random`.
    :type random_state: int, RandomState instance or None, optional
        (default=None)

    :param verbose: Controls the verbosity of the tree building process.
    :type verbose: int, optional (default=0)

    :var estimators\_: The collection of fitted sub-estimators.
    :vartype estimators\_: list

    :var estimators_samples\_: The subset of drawn samples (i.e.,
        the in-bag samples) for each base estimator.
    :vartype estimators_samples\_: list or arrays

    :var max_samples\_: The actual number of samples.
    :vartype max_samples\_: int
    """

    def __init__(self, n_estimators=100,
                 max_samples="auto",
                 contamination=0.1,
                 max_features=1.,
                 bootstrap=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        super(IForest, self).__init__(contamination=contamination)
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y=None):
        """Fit the wrapped IsolationForest on X and score the training data.

        :param X: The input samples; validated with ``check_array``.
        :param y: Passed to ``_set_n_classes`` only; never given to the
            underlying forest.
        :return: self
        """
        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                         max_samples=self.max_samples,
                                         contamination=self.contamination,
                                         max_features=self.max_features,
                                         bootstrap=self.bootstrap,
                                         n_jobs=self.n_jobs,
                                         random_state=self.random_state,
                                         verbose=self.verbose)
        self.detector_.fit(X=X,
                           y=None,
                           sample_weight=None)

        # invert decision_scores_. Outliers comes with higher outlier scores
        self.decision_scores_ = self.detector_.decision_function(X) * -1
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Return the negated decision function of the wrapped forest for X.

        :param X: The samples to score.
        :return: The forest's ``decision_function(X)`` multiplied by -1.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # invert decision_scores_. Outliers comes with higher outlier scores
        return self.detector_.decision_function(X) * -1

    @property
    def estimators_(self):
        """The collection of fitted sub-estimators.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_

    @property
    def estimators_samples_(self):
        """The subset of drawn samples (i.e., the in-bag samples) for
        each base estimator.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_samples_

    @property
    def max_samples_(self):
        """The actual number of samples.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.max_samples_
class IForest(BaseDetector):
    """Wrapper of scikit-learn Isolation Forest with more functionalities.

    The IsolationForest 'isolates' observations by randomly selecting a
    feature and then randomly selecting a split value between the maximum and
    minimum values of the selected feature.
    See :cite:`liu2008isolation,liu2012isolation` for details.

    Since recursive partitioning can be represented by a tree structure, the
    number of splittings required to isolate a sample is equivalent to the path
    length from the root node to the terminating node.

    This path length, averaged over a forest of such random trees, is a
    measure of normality and our decision function.

    Random partitioning produces noticeably shorter paths for anomalies.
    Hence, when a forest of random trees collectively produce shorter path
    lengths for particular samples, they are highly likely to be anomalies.

    Parameters
    ----------
    n_estimators : int, optional (default=100)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default="auto")
        The number of samples to draw from X to train each base estimator.

            - If int, then draw `max_samples` samples.
            - If float, then draw `max_samples * X.shape[0]` samples.
            - If "auto", then `max_samples=min(256, n_samples)`.

        If max_samples is larger than the number of samples provided,
        all samples will be used for all trees (no sampling).

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the threshold
        on the decision function.

    max_features : int or float, optional (default=1.0)
        The number of features to draw from X to train each base estimator.

            - If int, then draw `max_features` features.
            - If float, then draw `max_features * X.shape[1]` features.

    bootstrap : bool, optional (default=False)
        If True, individual trees are fit on random subsets of the training
        data sampled with replacement. If False, sampling without replacement
        is performed.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    behaviour : str, default='old'
        Behaviour of the ``decision_function`` which can be either 'old' or
        'new'. Passing ``behaviour='new'`` makes the ``decision_function``
        change to match other anomaly detection algorithm API which will be
        the default behaviour in the future. As explained in details in the
        ``offset_`` attribute documentation, the ``decision_function`` becomes
        dependent on the contamination parameter, in such a way that 0 becomes
        its natural threshold to detect outliers.

        .. versionadded:: 0.7.0
           ``behaviour`` is added in 0.7.0 for back-compatibility purpose.

        .. deprecated:: 0.20
           ``behaviour='old'`` is deprecated in sklearn 0.20 and will not be
           possible in 0.22.

        .. deprecated:: 0.22
           ``behaviour`` parameter will be deprecated in sklearn 0.22 and
           removed in 0.24.

        .. warning::
            Only applicable for sklearn 0.20 above.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.

    Attributes
    ----------
    estimators_ : list of DecisionTreeClassifier
        The collection of fitted sub-estimators.

    estimators_samples_ : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base
        estimator.

    max_samples_ : integer
        The actual number of samples

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, n_estimators=100,
                 max_samples="auto",
                 contamination=0.1,
                 max_features=1.,
                 bootstrap=False,
                 n_jobs=1,
                 behaviour='old',
                 random_state=None,
                 verbose=0):
        super(IForest, self).__init__(contamination=contamination)
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.behaviour = behaviour
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        # Arguments shared by every supported sklearn version.
        detector_kwargs = dict(
            n_estimators=self.n_estimators,
            max_samples=self.max_samples,
            contamination=self.contamination,
            max_features=self.max_features,
            bootstrap=self.bootstrap,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose,
        )

        # ``behaviour`` shifts the location of the anomaly scores. It is only
        # forwarded when the version helper reports 21; for any other value
        # the argument is left out.
        # noinspection PyProtectedMember
        if _get_sklearn_version() == 21:
            detector_kwargs['behaviour'] = self.behaviour

        self.detector_ = IsolationForest(**detector_kwargs)
        self.detector_.fit(X=X, y=None, sample_weight=None)

        # invert decision_scores_. Outliers comes with higher outlier scores.
        training_scores = self.detector_.decision_function(X)
        self.decision_scores_ = invert_order(training_scores)
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # invert outlier scores. Outliers comes with higher outlier scores
        raw_scores = self.detector_.decision_function(X)
        return invert_order(raw_scores)

    @property
    def estimators_(self):
        """The collection of fitted sub-estimators.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_

    @property
    def estimators_samples_(self):
        """The subset of drawn samples (i.e., the in-bag samples) for
        each base estimator.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_samples_

    @property
    def max_samples_(self):
        """The actual number of samples.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.max_samples_

    @property
    def feature_importances_(self):
        """The impurity-based feature importance.

        The higher, the more important the feature.
        The importance of a feature is computed as the (normalized)
        total reduction of the criterion brought by that feature. It is also
        known as the Gini importance.

        .. warning::
            impurity-based feature importance can be misleading for
            high cardinality features (many unique values). See
            https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html
            as an alternative.

        Returns
        -------
        feature_importances_ : ndarray of shape (n_features,)
            The values of this array sum to 1, unless all trees are single node
            trees consisting of only the root node, in which case it will be an
            array of zeros.
        """
        check_is_fitted(self)

        # Trees that consist of the root node only are left out.
        all_importances = Parallel(n_jobs=self.n_jobs)(
            delayed(getattr)(tree, "feature_importances_")
            for tree in self.detector_.estimators_
            if tree.tree_.node_count > 1)

        if not all_importances:
            # NOTE(review): ``n_features_in_`` is read from this wrapper, but
            # nothing in this class assigns it. Confirm the base class does.
            return np.zeros(self.n_features_in_, dtype=np.float64)

        mean_importances = np.mean(all_importances, axis=0, dtype=np.float64)
        return mean_importances / np.sum(mean_importances)
# Generate sample data X_train, y_train, X_test, y_test = \ generate_data(n_train=n_train, n_test=n_test, n_features=2, contamination=contamination, random_state=42) # train IsolationForest clf_name = 'IF' clf = IsolationForest(random_state=0) clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.predict(X_train) # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_function(X_train) # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # Step 2: Determine the cut point import matplotlib.pyplot as plt plt.hist(y_test_scores, bins='auto') plt.title("Histogram with IF Anomaly Scores") plt.show() test_scores = pd.DataFrame({'Scores': y_test_scores, 'Labels': y_test_pred}) pd.DataFrame({ 'Outliers': test_scores.groupby('Labels').get_group(-1).Scores, 'Inlierss': test_scores.groupby('Labels').get_group(1).Scores
class IForest(RandomSplitForest):
    """Isolation forest whose trees can be partially replaced from new data.

    Wraps a scikit-learn ``IsolationForest`` (``self.ifor``). New samples are
    collected in ``self.buffer`` via ``add_samples`` and later used to train
    replacement trees that are swapped into the wrapped forest.
    """

    def __init__(self, n_estimators=100, max_samples="auto", contamination=0.1,
                 max_features=1., bootstrap=False, n_jobs=1, replace_frac=0.2,
                 random_state=None, verbose=0):
        RandomSplitForest.__init__(self, n_estimators=n_estimators,
                                   max_samples=max_samples,
                                   max_features=max_features,
                                   bootstrap=bootstrap,
                                   n_jobs=n_jobs,
                                   random_state=random_state,
                                   verbose=verbose)
        self.contamination = contamination
        # The fraction of trees replaced when new window of data arrives
        self.replace_frac = replace_frac
        self.ifor = None
        self.estimators_features_ = None
        self.buffer = None
        self.updated = False

    def fit(self, X, y=None, sample_weight=None):
        """Train a fresh IsolationForest on X and adopt its trees."""
        self.ifor = IsolationForest(n_estimators=self.n_estimators,
                                    max_samples=self.max_samples,
                                    contamination=self.contamination,
                                    max_features=self.max_features,
                                    bootstrap=self.bootstrap,
                                    n_jobs=self.n_jobs,
                                    random_state=self.random_state,
                                    verbose=self.verbose)
        self.ifor.fit(X, y, sample_weight)
        self.estimators_ = self.ifor.estimators_
        self.estimators_features_ = self.ifor.estimators_features_
        self.updated = False

    def _fit(self, X, y, max_samples, max_depth, sample_weight=None):
        raise NotImplementedError("method _fit() not supported")

    def decision_function(self, X):
        """Return the wrapped forest's decision function for X.

        Emits a debug message when trees have been replaced since the last
        ``fit``.
        """
        if self.updated:
            logger.debug("WARN: The underlying isolation forest was updated and " +
                         "using calling decision_function() on it will likely return inconsistent results.")
        return self.ifor.decision_function(X)

    def supports_streaming(self):
        return True

    def add_samples(self, X, current=True):
        """Append X to the stream buffer.

        Raises ValueError when ``current`` is true: only buffering for a
        later update is supported.
        """
        if current:
            raise ValueError("IForest does not support adding to current instance set.")
        if self.buffer is None:
            self.buffer = X
        else:
            self.buffer = np.vstack([self.buffer, X])

    def update_trees_by_replacement(self, X=None, replace_trees=None):
        """Replace part of the forest with trees trained on X.

        Parameters
        ----------
        X : training data for the new trees; defaults to the stream buffer.
        replace_trees : optional collection of tree indexes to replace. When
            omitted, the first ``replace_frac`` fraction of trees is replaced.

        Returns
        -------
        None when there is no data to train on; otherwise a 3-tuple of
        single-element lists: replaced indexes, retained indexes, and the
        new estimators (``None`` inside the list if no tree was replaced).

        Raises
        ------
        ValueError
            If ``replace_trees`` names more distinct trees than the forest has.
        """
        if X is None:
            X = self.buffer
        if X is None:
            logger.warning("No new data for update")
            return None

        n_trees = len(self.estimators_)
        if replace_trees is not None:
            replace_set = set(replace_trees)
            n_new_trees = len(replace_set)
            # The original guard compared against 0 and could never fire.
            if n_new_trees > n_trees:
                raise ValueError("Replacement set is larger than allowed")
            old_tree_indexes_replaced = replace_trees
            old_tree_indexes_retained = np.array(
                [i for i in range(n_trees) if i not in replace_set], dtype=int)
        else:
            n_new_trees = int(self.replace_frac * n_trees)
            old_tree_indexes_replaced = np.arange(0, n_new_trees, dtype=int)
            old_tree_indexes_retained = np.arange(n_new_trees, n_trees)

        if n_new_trees > 0:
            new_ifor = IsolationForest(n_estimators=n_new_trees,
                                       max_samples=self.max_samples,
                                       contamination=self.contamination,
                                       max_features=self.max_features,
                                       bootstrap=self.bootstrap,
                                       n_jobs=self.n_jobs,
                                       random_state=self.random_state,
                                       verbose=self.verbose)
            new_ifor.fit(X, y=None, sample_weight=None)

            # retain estimators and features
            self.estimators_ = [self.estimators_[i] for i in old_tree_indexes_retained]
            self.estimators_features_ = [self.estimators_features_[i] for i in old_tree_indexes_retained]

            # append the new trees at the end of the list of older trees
            self.estimators_.extend(new_ifor.estimators_)
            self.estimators_features_.extend(new_ifor.estimators_features_)

            # Now, update the underlying isolation forest
            # NOTE: This might make the model inconsistent
            self.ifor.estimators_ = self.estimators_
            self.ifor.estimators_features_ = self.estimators_features_

            new_estimators = new_ifor.estimators_
        else:
            new_estimators = None

        self.updated = True
        self.buffer = None

        # we return lists in order to support feature groups in multiview
        # forest (see IForestMultiview)
        return [old_tree_indexes_replaced], [old_tree_indexes_retained], [new_estimators]

    def update_model_from_stream_buffer(self, replace_trees=None):
        """Update the forest from the stream buffer.

        ``replace_trees`` is forwarded to ``update_trees_by_replacement``;
        it used to be accepted but ignored.
        """
        return self.update_trees_by_replacement(self.buffer,
                                                replace_trees=replace_trees)
# Generate some regular novel observations X = 0.3 * rng.randn(20, 2) X_test = np.r_[X + 2, X - 2] # Generate some abnormal novel observations X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)) # fit the model clf = IsolationForest(max_samples=100, random_state=rng) clf.fit(X_train) y_pred_train = clf.predict(X_train) y_pred_test = clf.predict(X_test) y_pred_outliers = clf.predict(X_outliers) # plot the line, the samples, and the nearest vectors to the plane xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50)) Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.title("IsolationForest") plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r) b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k') b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k') c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red', s=20, edgecolor='k') plt.axis('tight') plt.xlim((-5, 5)) plt.ylim((-5, 5))
# Script fragment: fits an IsolationForest on x_train and counts how many
# training rows it labels -1.
# NOTE(review): the fragment starts mid-stream. `index += 1` is presumably
# the tail of a loop that begins before this view, so its real indentation
# is unknown.
index += 1
print(x_train.shape)
X_train = x_train
rng = np.random.RandomState(42)
# NOTE(review): max_features is the int 1 here, not the float 1.0; confirm
# which was intended.
isofortrain = IsolationForest(n_estimators=1000, max_samples='auto',
                              contamination=.20, max_features=1,
                              random_state=rng, n_jobs=-1)
isofortrain.fit(X_train)
anomalytrain = isofortrain.decision_function(X_train)
predicttrain = isofortrain.predict(X_train)
len_predictrain = len(predicttrain)
print("len_predictrain", len_predictrain)
# Count the training rows predicted as -1.
num_iforest_diff = 0
for i in predicttrain:
    if i == -1:
        num_iforest_diff += 1
print("num_iforest_diff", num_iforest_diff)
# Counters reset for code that follows this fragment.
same = 0
index = 0
# Script fragment: compares IsolationForest and LocalOutlierFactor against the
# known labels in y_value.
# NOTE(review): `x_value`, `y_value`, `outlier_value` and `fraudal_count` are
# defined outside this fragment.
print(x_value.shape)
# Print shapes
print(x_value.shape)
print(y_value.shape)

# Algorithms used: Isolation Forest and Local Outlier Factor are common
# anomaly detection methods
random_isolation = IsolationForest(max_samples=len(x_value),
                                   contamination=outlier_value,
                                   random_state=3)
local_outlier = LocalOutlierFactor(n_neighbors=12,
                                   contamination=outlier_value)
n_outlier = len(fraudal_count)

# fit and predict
# Each prediction now lands in the variable named after its own model; the
# two names were swapped before, so the printed error counts were mislabelled.
random_isolation.fit(x_value)
score_prediction = random_isolation.decision_function(x_value)
y_predict_isf = random_isolation.predict(x_value)

y_predict_lof = local_outlier.fit_predict(x_value)
# Kept as in the original: score_prediction ends up holding the LOF factors.
score_prediction = local_outlier.negative_outlier_factor_

# Change the value to 0 for valid and 1 for fraudulent cases.
y_predict_isf[y_predict_isf == 1] = 0
y_predict_isf[y_predict_isf == -1] = 1
y_predict_lof[y_predict_lof == 1] = 0
y_predict_lof[y_predict_lof == -1] = 1

n_error_isf = (y_predict_isf != y_value).sum()
n_error_lof = (y_predict_lof != y_value).sum()
print("Error value for Isolation forest ", n_error_isf)
print("Error value for local outlier function ", n_error_lof)
#ocsvm = OneClassSVM(kernel='linear', degree=2, gamma='auto', nu=0.5) ocsvm = OneClassSVM(gamma='auto', nu=0.01) lim_inf = X.min(axis=0) lim_sup = X.max(axis=0) volume_support = (lim_sup - lim_inf).prod() t = np.arange(0, 100 / volume_support, 0.01 / volume_support) axis_alpha = np.arange(alpha_min, alpha_max, 0.0001) unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features)) # fit: print('IsolationForest processing...') iforest = IsolationForest() iforest.fit(X_train) s_X_iforest = iforest.decision_function(X_train) print('LocalOutlierFactor processing...') lof.fit(X_train) s_X_lof = lof.decision_function(X_train) print('OneClassSVM processing...') ocsvm.fit(X_train) s_X_ocsvm = ocsvm.decision_function(X_train).reshape(1, -1)[0] s_unif_iforest = iforest.decision_function(unif) s_unif_lof = lof.decision_function(unif) s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0] plt.subplot(121) print("t ist: " ,t) print("t_max ist : " , t_max)
def detect(self):
    """Detect outliers with an isolation forest and write them to a CSV file.

    The model is trained on the preprocessed data and/or loaded from
    ``self.modelname`` depending on ``self.isSaveModel`` / ``self.isLoadModel``:

    * save and load: train, save the model, then predict;
    * save only: train, save the model, and return without predicting;
    * load only: load the saved model, then predict;
    * neither: train (without saving), then predict.

    Predictions are made in batches of ``self.batch`` rows and written to
    ``self.output_filename`` with the columns ``timestamp``,
    ``outlier_score`` and ``is_outlier`` (1 = outlier, 0 = normal).

    Returns
    -------
    None
    """
    # Preprocessed data
    data = self.preprocess()

    ilf = IsolationForest(
        n_estimators=self.n_estimators,
        n_jobs=-1,  # use all CPUs
        verbose=self.verbose,
        contamination=self.contamination,  # proportion of outliers
    )

    if self.isLoadModel and not self.isSaveModel:
        # Load the local model instead of training.
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # model files that come from a trusted source.
        print('Loading model from `%s`...' % self.modelname)
        with open(self.modelname, 'rb') as fp:
            ilf = pickle.load(fp)
    else:
        print('Model training...')
        ilf.fit(data)
        if self.isSaveModel:
            print('Saving model to `%s`...' % self.modelname)
            with open(self.modelname, 'wb') as fp:
                pickle.dump(ilf, fp)
            if not self.isLoadModel:
                # Save-only mode stops after persisting the model.
                print('Don\'t predict.')
                return

    # Predict
    print('Outliers predicting...')
    n_rows = data.shape[0]
    all_pred = []
    all_score = []
    # Step through the rows batch by batch. Iterating over the start offsets
    # never yields an empty trailing batch (the estimator rejects empty
    # input), which used to happen when n_rows was a multiple of self.batch.
    for start in range(0, n_rows, self.batch):
        batch_test = data[start:start + self.batch]
        # +1 marks a normal sample, -1 an outlier
        all_pred.extend(ilf.predict(batch_test))
        # anomaly score: the lower, the more likely the sample is an outlier
        all_score.extend(ilf.decision_function(batch_test))

    data['timestamp'] = self.origin_data['timestamp']
    data['is_outlier'] = all_pred
    data['outlier_score'] = all_score

    # Convert the output columns: integer timestamps, outlier flag as 0/1
    data['timestamp'] = data['timestamp'].astype('int64')
    data.loc[data.is_outlier == 1, 'is_outlier'] = 0
    data.loc[data.is_outlier == -1, 'is_outlier'] = 1

    print('Writing `%s`...' % self.output_filename)
    data.to_csv(self.output_filename,
                columns=['timestamp', 'outlier_score', 'is_outlier'],
                header=True,
                index=0)
# For every feature pair: draw a scatter plot, fit an isolation forest on the
# two features, and rank the known outliers by their (negated) score.
# NOTE(review): Python 2 source (print statements). The loop extent below is
# reconstructed from a whitespace-collapsed snippet; confirm against the file.
pp = PdfPages(plotfolder + 'scatterplots.pdf')
for j, features in enumerate(feature_pairs):
    X, Y = features[0], features[1]
    print j, 'of', len(feature_pairs)
    # Two-column array holding the two selected features.
    pair_features = np.array([INFO[features[0]], INFO[features[1]]]).T
    forest = IsolationForest(
        n_estimators=100,
        #max_samples=1000,
        random_state=0,
        contamination=num_outlier / 343546.0 # number of nodes
    )
    # NOTE(review): `fig` is never written to `pp` in the visible code.
    fig = scatter_plot(INFO[X], INFO[Y], INFO['IDs'], discription[Y], discription[X], discription[Y] + ' vs ' + discription[X], compare_value[X])
    forest.fit(pair_features)
    # Score only the rows listed in outlier_ids.
    scores = forest.decision_function(pair_features[outlier_ids, :])
    # Negate the scores and sort descending, i.e. lowest decision value first.
    rank_list = sorted([(outliers[i], -s) for (i, s) in enumerate(scores)], key=lambda x: x[1], reverse=True)
    rank_matrix.append(rank_list)
pp.close()
print rank_matrix
# runs, properly till this, why is generate_graph returning nothing?
scaled_matrix, normal_matrix = ranklist.generate_graph(P_val, num_outlier, rank_matrix)
plots = plotSpot(budget, scaled_matrix, "SpellOut")
frequencies = generate_frequency_list(plots, scaled_matrix)
for i, plot in enumerate(plots):
    fig = scatter_outliers(plot, INFO['IDs'], frequencies)
# First half of the samples is used for fitting, second half for scoring.
n_samples_train = n_samples // 2

X = X.astype(float)
X_train = X[:n_samples_train, :]
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

print('--- Fitting the IsolationForest estimator...')
model = IsolationForest(n_jobs=-1, random_state=random_state)
tstart = time()
model.fit(X_train)
fit_time = time() - tstart

tstart = time()
# decision_function is lower for abnormal samples; negate it so that a higher
# score means "more abnormal", matching the positive class (outliers, y == 1).
scoring = -model.decision_function(X_test)
# Stop the clock right after scoring so that the optional histogram plotting
# below is not counted as prediction time.
predict_time = time() - tstart

print("--- Preparing the plot elements...")
if with_decision_function_histograms:
    fig, ax = plt.subplots(3, sharex=True, sharey=True)
    bins = np.linspace(-0.5, 0.5, 200)
    ax[0].hist(scoring, bins, color='black')
    ax[0].set_title('Decision function for %s dataset' % dat)
    ax[1].hist(scoring[y_test == 0], bins, color='b', label='normal data')
    ax[1].legend(loc="lower right")
    ax[2].hist(scoring[y_test == 1], bins, color='r', label='outliers')
    ax[2].legend(loc="lower right")

# Show ROC Curves
fpr, tpr, thresholds = roc_curve(y_test, scoring)
def test_behaviour_param():
    """Check that behaviour='old' and behaviour='new' give equal decision values."""
    training_points = [[1, 1], [1, 2], [2, 1]]
    query = [[2., 2.]]

    old_style = IsolationForest(behaviour='old').fit(training_points)
    new_style = IsolationForest(behaviour='new',
                                contamination='auto').fit(training_points)

    old_decision = old_style.decision_function(query)
    new_decision = new_style.decision_function(query)
    assert_array_equal(old_decision, new_decision)
#print(marks.head(10))
#print(marks)

### for only Maths ###
#model=IsolationForest(n_estimators=100, max_samples='auto', contamination=float(0.2), max_features=1.0)
#model.fit(marks[['Mathematics']])
# Prediction
#marks['anomailes_scores_math']=model.decision_function(marks[['Mathematics']])
#marks['anomaly_math']=model.predict(marks[['Mathematics']]) # here, 1 for good data and -1 for bad data
#print(marks)

## for both Eng and Maths ##
# Fit one forest on the two subject columns together; fit() returns the
# estimator itself, so construction and fitting are chained.
model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.2,
    max_features=1.0,
).fit(marks[['English', 'Mathematics']])

# Prediction: store the raw anomaly score and the predicted label per row.
marks['anomailes_scores_both'] = model.decision_function(marks[['English', 'Mathematics']])
# here, 1 for good data and -1 for bad data
marks['anomaly_for_both'] = model.predict(marks[['English', 'Mathematics']])
print(marks)
def insights_model(self, entry_info, repo_id):
    """Collect metric time series for a repository and record anomalous points.

    For every endpoint/field pair in ``self.metrics`` the endpoint is
    queried over HTTP and its values are joined into one daily dataframe.
    Old rows are deleted from ``repo_insights_records`` and
    ``repo_insights``. An IsolationForest is then fitted per metric column,
    and anomalies found within the last ``self.anomaly_days`` days are
    inserted into both tables and passed to ``self.send_insight``. The task
    is finally marked complete through ``self.register_task_completion``.

    Parameters
    ----------
    entry_info
        Task description; only logged and forwarded to
        ``register_task_completion`` here.
    repo_id
        Repository identifier used in the endpoint URL and in the queries.
    """
    logging.info("Discovering insights for task with entry info: {}\n".format(entry_info))

    """ Collect data """
    base_url = 'http://{}:{}/api/unstable/repo-groups/9999/repos/{}/'.format(
        self.config['api_host'], self.config['api_port'], repo_id)

    # Dataframe to hold all endpoint results
    # Subtract configurable amount of time
    begin_date = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) - datetime.timedelta(days=self.training_days)
    index = pd.date_range(begin_date, periods=self.training_days, freq='D')
    df = pd.DataFrame(index=index)

    # Hit and discover insights for every endpoint we care about
    for endpoint, field in self.metrics.items():

        # Hit endpoint
        url = base_url + endpoint
        logging.info("Hitting endpoint: " + url + "\n")
        try:
            data = requests.get(url=url).json()
        except:
            # NOTE(review): bare except; the fallback issues the request a
            # second time and leaves `data` as the response text (a str).
            data = json.loads(json.dumps(requests.get(url=url).text))

        if len(data) == 0:
            logging.info("Endpoint with url: {} returned an empty response. Moving on to next endpoint.\n".format(url))
            continue

        if 'date' not in data[0]:
            logging.info("Endpoint {} is not a timeseries, moving to next endpoint.\n".format(endpoint))
            continue

        metric_df = pd.DataFrame.from_records(data)
        metric_df.index = pd.to_datetime(metric_df['date'], utc=True).dt.date
        # Join this metric onto the daily index; missing days become 0.
        df = df.join(metric_df[field]).fillna(0)
        # Column name becomes "<endpoint> - <field>"; it is split again below.
        df.rename(columns={field: "{} - {}".format(endpoint, field)}, inplace=True)

    """ End collect endpoint data """

    # If none of the endpoints returned data
    if df.size == 0:
        logging.info("None of the provided endpoints provided data for this repository. Anomaly detection is 'done'.\n")
        self.register_task_completion(entry_info, repo_id, "insights")
        return

    """ Deletion of old insights """

    # Delete previous insights not in the anomaly_days param
    min_date = datetime.datetime.now() - datetime.timedelta(days=self.anomaly_days)
    logging.info("MIN DATE: {}\n".format(min_date))
    logging.info("Deleting out of date records ...\n")
    delete_record_SQL = s.sql.text(""" DELETE FROM repo_insights_records WHERE repo_id = :repo_id AND ri_date < :min_date """)
    result = self.db.execute(delete_record_SQL, repo_id=repo_id, min_date=min_date)

    logging.info("Deleting out of date data points ...\n")
    delete_points_SQL = s.sql.text(""" DELETE FROM repo_insights USING ( SELECT ri_metric, ri_field FROM ( SELECT * FROM repo_insights WHERE ri_fresh = TRUE AND repo_id = :repo_id AND ri_date < :min_date ) old_insights ) to_delete WHERE repo_insights.ri_metric = to_delete.ri_metric AND repo_insights.ri_field = to_delete.ri_field """)
    result = self.db.execute(delete_points_SQL, repo_id=repo_id, min_date=min_date)

    # get table values to check for dupes later on
    # NOTE(review): repo_id is formatted straight into the WHERE clause;
    # presumably an internal integer id - confirm it is never user-supplied.
    insight_table_values = self.get_table_values(['*'], ['repo_insights_records'], where_clause="WHERE repo_id = {}".format(repo_id))

    to_model_columns = df.columns[0:len(self.metrics)+1]

    model = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(self.contamination), \
                    max_features=1.0, bootstrap=False, n_jobs=-1, random_state=32, verbose=0)
    # NOTE(review): this fit on all columns is superseded by the per-column
    # refit at the top of the loop below.
    model.fit(df[to_model_columns])

    def classify_anomalies(df,metric):
        # Re-label the `anomaly_class` column of `df` and return the frame
        # sorted by `date_col`, newest first.
        df = df.sort_values(by='date_col', ascending=False)

        # Shift metric values by one date to find the percentage chage between current and previous data point
        df['shift'] = df[metric].shift(-1)
        df['percentage_change'] = ((df[metric] - df['shift']) / df[metric]) * 100

        # Categorise anomalies as 0 - no anomaly, 1 - low anomaly , 2 - high anomaly
        # NOTE(review): these are chained assignments (df[col].loc[mask] = ...);
        # confirm they write back to `df` on the pandas version in use.
        df['anomaly_class'].loc[df['anomaly_class'] == 1] = 0
        df['anomaly_class'].loc[(df['anomaly_class'] == -1) & (df[metric] != 0) & (df[metric] != 1)] = 2
        max_anomaly_score = df['score'].loc[df['anomaly_class'] == 2].max()
        medium_percentile = df['score'].quantile(0.24)
        df['anomaly_class'].loc[(df['score'] > max_anomaly_score) & (df['score'] <= medium_percentile)] = 1
        return df

    for i, metric in enumerate(to_model_columns):

        # Fit the model to the data returned from the endpoints
        model.fit(df.iloc[:,i:i+1])
        pred = model.predict(df.iloc[:,i:i+1])

        # Create df and adopt previous index from when we called the endpoints
        anomaly_df = pd.DataFrame()
        anomaly_df['date_col'] = df.index
        anomaly_df.index = df.index

        # Find decision function to find the score and classify anomalies
        anomaly_df['score'] = model.decision_function(df.iloc[:,i:i+1])
        anomaly_df[metric] = df.iloc[:,i:i+1]
        anomaly_df['anomaly_class'] = pred

        # Get the indexes of outliers in order to compare the metrics with use case anomalies if required
        outliers = anomaly_df.loc[anomaly_df['anomaly_class'] == -1]
        outlier_index = list(outliers.index)
        anomaly_df = classify_anomalies(anomaly_df,metric)

        # Filter the anomaly_df by days we want to detect anomalies
        begin_detection_date = datetime.datetime.now() - datetime.timedelta(days=self.anomaly_days)
        detection_tuples = anomaly_df.index > begin_detection_date
        anomaly_df = anomaly_df.loc[detection_tuples]

        # Make a copy of the df for logging of individual tuples in the repo_insights table
        anomaly_df_copy = anomaly_df.copy()

        # Calculate mean
        mean = anomaly_df[metric].mean()

        # Make columns numeric for argmax to function properly
        for col in anomaly_df.columns:
            anomaly_df[col] = pd.to_numeric(anomaly_df[col])

        # Split into endpoint and field name
        split = metric.split(" - ")

        most_recent_anomaly_date = None
        most_recent_anomaly = None
        insight_count = 0

        # Walk through the class-2 anomalies one at a time; every pass drops
        # the rows at or after the anomaly just handled.
        while True:

            if anomaly_df.loc[anomaly_df['anomaly_class'] == 2].empty:
                logging.info("No more anomalies to be found for metric: {}\n".format(metric))
                break

            # All candidate rows share anomaly_class == 2, so idxmax returns
            # the first of them in the current (newest-first) row order.
            next_recent_anomaly_date = anomaly_df.loc[anomaly_df['anomaly_class'] == 2]['anomaly_class'].idxmax()
            logging.info("Next most recent date: \n{}\n".format(next_recent_anomaly_date))
            next_recent_anomaly = anomaly_df.loc[anomaly_df.index == next_recent_anomaly_date]
            logging.info("Next most recent anomaly: \n{}\n{}\n".format(next_recent_anomaly.columns.values,
                next_recent_anomaly.values))

            # The first anomaly seen is remembered as the most recent one.
            if insight_count == 0:
                most_recent_anomaly_date = next_recent_anomaly_date
                most_recent_anomaly = next_recent_anomaly

            # Format numpy 64 date into timestamp
            date64 = next_recent_anomaly.index.values[0]
            ts = (date64 - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
            ts = datetime.datetime.utcfromtimestamp(ts)

            # Skip insertion when the same (date, metric, field) is already recorded.
            insight_exists = ((insight_table_values['ri_date'] == ts) & \
                (insight_table_values['ri_metric'] == split[0]) & (insight_table_values['ri_field'] == split[1])).any()

            if not insight_exists:

                # Insert record in records table and send record to slack bot
                record = {
                    'repo_id': repo_id,
                    'ri_metric': split[0],
                    'ri_field': split[1],
                    'ri_value': next_recent_anomaly.iloc[0][metric],
                    'ri_date': ts,
                    'ri_score': next_recent_anomaly.iloc[0]['score'],
                    'ri_detection_method': 'Isolation Forest',
                    "tool_source": self.tool_source,
                    "tool_version": self.tool_version,
                    "data_source": self.data_source
                }
                result = self.db.execute(self.repo_insights_records_table.insert().values(record))
                logging.info("Primary key inserted into the repo_insights_records table: {}\n".format(
                    result.inserted_primary_key))
                self.results_counter += 1

                # Send insight to Jonah for slack bot
                self.send_insight(record, abs(next_recent_anomaly.iloc[0][metric] - mean))

                insight_count += 1
            else:
                logging.info("Duplicate insight found, skipping insertion. "
                    "Continuing iteration of anomalies...\n")

            # Keep only rows strictly older than the anomaly just processed.
            anomaly_df = anomaly_df[anomaly_df.index < next_recent_anomaly_date]

        # If no insights for this metric were found, then move onto next metric
        # (since there is no need to insert the endpoint results below)
        if insight_count == 0:
            continue

        # Begin inserting to table to build frontend charts
        # NOTE(review): the loop variable shadows the builtin `tuple`.
        for tuple in anomaly_df_copy.itertuples():
            try:
                # Format numpy 64 date into timestamp
                date64 = tuple.Index
                ts = (date64 - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
                ts = datetime.datetime.utcfromtimestamp(ts)

                data_point = {
                    'repo_id': repo_id,
                    'ri_metric': split[0],
                    'ri_field': split[1],
                    # presumably `_3` is the metric column (third column after
                    # the index: date_col, score, <metric>) - verify
                    'ri_value': tuple._3,
                    'ri_date': ts,
                    'ri_fresh': 0 if date64 < most_recent_anomaly_date else 1,
                    'ri_score': most_recent_anomaly.iloc[0]['score'],
                    'ri_detection_method': 'Isolation Forest',
                    "tool_source": self.tool_source,
                    "tool_version": self.tool_version,
                    "data_source": self.data_source
                }
                result = self.db.execute(self.repo_insights_table.insert().values(data_point))
                logging.info("Primary key inserted into the repo_insights table: {}\n".format(
                    result.inserted_primary_key))

                logging.info("Inserted data point for metric: {}, date: {}, value: {}\n".format(metric, ts, tuple._3))
            except Exception as e:
                # The first failure stops inserting points for this metric.
                logging.info("error occurred while storing datapoint: {}\n".format(repr(e)))
                break

    self.register_task_completion(entry_info, repo_id, "insights")
def detect_anomalies(self, data, **params):
    """Fit an isolation forest on ``data`` and return its decision scores.

    The forest is created with ``verbose=1``; ``params`` are then applied
    through ``set_params`` (so they may override that setting). The
    returned anomaly scores are lower for more abnormal samples.
    """
    # set_params returns the estimator, so configuration can be chained.
    forest = IsolationForest(verbose=1).set_params(**params)
    forest.fit(data)
    scores = forest.decision_function(data)
    return scores
strong_outlier = False
try:
    # If the highest frequency is more than 4 times the second one, remove it
    # from the training data and classify it directly.
    if frequency_tree[0][0] > (4 * float(frequency_tree[1][0])):
        frequency_tree = frequency_tree[1:]
        col = [red]  # classify highest value as Hot
        strong_outlier = True
except Exception:
    # Best effort: if the two leading entries cannot be compared (e.g. fewer
    # than two rows), continue without the shortcut. Catching Exception rather
    # than using a bare except lets KeyboardInterrupt/SystemExit propagate.
    pass

# Create the isolation forest model and train and test.
clf = IsolationForest(random_state=0, bootstrap=False).fit(frequency_tree)
# there is also predict as a method (value 1 or -1)
outlier_score = clf.decision_function(frequency_tree)
# -1 is outlier and 1 is inlier
outlier_classification = clf.predict(frequency_tree)

# label the results of the classification
x_pos = [i for i, _ in enumerate(mutation)]

# if there is no negative value: blue and green
# number of negative outlier scores
outcome_score = sum(1 for number in outlier_score if number < 0)
# Check whether the classification agrees, because an outlier score can
# occasionally flip to a negative number.
outcome_classification = sum(1 for number in outlier_classification if number < 0)
#coloring the data according to the outliner score generated with following criteria
bootstrap:布尔型参数,默认取False,表示构建iTree时有放回地进行抽样;
'''
# NOTE(review): the two lines above are the tail of a triple-quoted string
# whose opening lies outside this snippet; they are left untouched.

# Set the number of training samples and the fraction of outliers
n_samples = 10000
outliers_fraction = 0.25
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)

# // denotes integer division
rng = np.random.RandomState(123)
X = 0.3 * rng.randn(n_inliers // 2, 2)
# Build the normal samples (two clusters shifted by +2 and -2) and the outliers
X_train = np.r_[X + 2, X - 2]
outliers = rng.uniform(low=-6, high=6, size=(n_outliers, 2))
# Merge the normal samples with the outliers
X_train = np.r_[X_train, outliers]

clf = IsolationForest(contamination=outliers_fraction, random_state=2018, n_jobs=-1, behaviour="new")
# predict / fit_predict return, for every sample, 1 for a normal value and
# -1 for an outlier
y_pred_train = clf.fit_predict(X_train)
# Human-readable label for each prediction (the label strings are runtime data).
pred = np.array(['正常' if i==1 else '异常' for i in y_pred_train])

# The further a score lies below 0, the more likely the sample is an outlier
scores_pred = clf.decision_function(X_train)
dict_ = {'anomaly_score':scores_pred, 'y_pred':y_pred_train, 'result':pred}
scores = pd.DataFrame(dict_)
print(scores.sample(5))
barmode='group', height=400)
fig.update_yaxes(title_text="Model Metrics")
fig.update_layout(title_text="Model Performance")
fig.show()
# -

# ## Feature Selection, resampling and data transformation

# +
#Anomaly Detection
from sklearn.ensemble import IsolationForest
iforest = IsolationForest(n_estimators=100, contamination=0.01)
# fit_predict labels each row of X_train_prepared: -1 is used below as the anomaly label
pred = iforest.fit_predict(X_train_prepared)
# NOTE(review): `score` is not used anywhere in the visible code.
score = iforest.decision_function(X_train_prepared)
from numpy import where
# Positions of the rows predicted as anomalies.
anom_index = where(pred == -1)
# NOTE(review): the positions come from X_train_prepared but are looked up in
# X; this assumes both frames have the same row order - TODO confirm.
values = X.iloc[anom_index]
# Drop the flagged rows, by index label, from both the features and the target.
for i in values.index:
    X_train_prepared = X_train_prepared.drop(i)
    y_train = y_train.drop(i)
# -

ctr = len(values)
print("Number of observations dropped = {}".format(ctr))

# +
# Modelling with balanced target
# Convert the training and testing series with paa.ts_to_PAA, fit an
# isolation forest on the training representation, then score the testing
# representation once per candidate threshold value.
# NOTE(review): reconstructed from a whitespace-collapsed snippet; the extent
# of the `for` body below is an assumption - confirm against the original.
w = 30
m = 10
st = time.time()
training_paa = paa.ts_to_PAA(w, m, training_ts_list)
testing_paa = paa.ts_to_PAA(w, m, testing_ts_list)
print("PAA time = {}".format(time.time() - st))
#print (training_paa)
print(training_paa.shape)

t1 = time.time()
IF1 = IsolationForest(max_samples=256, n_estimators=100, contamination=0.01)
IF1.fit(training_paa)

#cont = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2, 0.3, 0.4, 0.5]
cont = [0.01, 0.02, 0.04, 0.08, 0.1, 0.2, 0.4, 0.5]
for val in cont:
    # NOTE(review): the score does not depend on `val`; it is recomputed on
    # every iteration.
    anomaly_score = IF1.decision_function(testing_paa)
    # Prepend w zeros to the scores.
    anomaly_score = [0 for x in range(w)] + [z for z in anomaly_score]
    predict_score = predict(anomaly_score, val)
    # NOTE(review): `predict_socre` looks like a typo of `predict_score`; the
    # copy made here is never used in the visible code.
    predict_socre = [z for z in predict_score]
    plot_graphs(training_ts_list, testing_ts_list, anomaly_score, predict_score)

# First of four stacked subplots: the first three training signals.
plt.subplot(411)
plt.title('Training Signal')
plt.xlabel('Instance Number')
plt.ylabel('Value')
plt.plot(range(len(training_ts_list[0])), training_ts_list[0], color='b')
plt.plot(range(len(training_ts_list[1])), training_ts_list[1], color='r')
plt.plot(range(len(training_ts_list[2])), training_ts_list[2], color='g')
plt.subplot(412)
class IForest(BaseDetector):
    """Isolation Forest detector built on the scikit-learn implementation.

    An isolation forest isolates an observation by repeatedly picking a
    random feature and a random split value between that feature's minimum
    and maximum. See :cite:`liu2008isolation,liu2012isolation` for details.

    The recursive splits form a tree, so the number of splits needed to
    isolate a sample equals the path length from the root to the
    terminating node. Averaged over a forest of random trees, this path
    length measures normality and serves as the decision function.

    Anomalies are isolated by noticeably shorter paths, so samples that the
    forest collectively reaches quickly are very likely anomalies.

    Parameters
    ----------
    n_estimators : int, optional (default=100)
        Number of base estimators in the ensemble.

    max_samples : int or float, optional (default="auto")
        Number of samples drawn from X to train each base estimator.

            - int: draw `max_samples` samples.
            - float: draw `max_samples * X.shape[0]` samples.
            - "auto": `max_samples=min(256, n_samples)`.

        When max_samples exceeds the number of samples provided, every tree
        uses all samples (no sampling).

    contamination : float in (0., 0.5), optional (default=0.1)
        Proportion of outliers in the data set. Used when fitting to define
        the threshold on the decision function.

    max_features : int or float, optional (default=1.0)
        Number of features drawn from X to train each base estimator.

            - int: draw `max_features` features.
            - float: draw `max_features * X.shape[1]` features.

    bootstrap : boolean, optional (default=False)
        If True, each tree is fit on a random subset of the training data
        sampled with replacement. If False, sampling is done without
        replacement.

    n_jobs : integer, optional (default=1)
        Number of jobs run in parallel for both `fit` and `predict`.
        -1 sets the number of jobs to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        An int is used as the seed of the random number generator; a
        RandomState instance is used as the generator itself; None means
        the RandomState instance used by `np.random`.

    verbose : int, optional (default=0)
        Verbosity of the tree building process.

    Attributes
    ----------
    estimators_ : list of DecisionTreeClassifier
        The fitted sub-estimators.

    estimators_samples_ : list of arrays
        The drawn (in-bag) samples of each base estimator.

    max_samples_ : integer
        The actual number of samples

    decision_scores_ : numpy array of shape (n_samples,)
        Outlier scores of the training data; higher means more abnormal.
        Available once the detector is fitted.

    threshold_ : float
        Threshold derived from ``contamination``: it corresponds to the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_`` and is used to generate binary outlier labels.

    labels_ : int, either 0 or 1
        Binary labels of the training data (0 = inlier, 1 =
        outlier/anomaly), obtained by applying ``threshold_`` to
        ``decision_scores_``.
    """

    def __init__(self, n_estimators=100,
                 max_samples="auto",
                 contamination=0.1,
                 max_features=1.,
                 bootstrap=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        super(IForest, self).__init__(contamination=contamination)
        # Stored unchanged; they are handed to the scikit-learn estimator
        # when fit() is called.
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y=None):
        """Fit the detector on X; y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """
        # Validate the inputs before building the underlying estimator.
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = forest = IsolationForest(
            n_estimators=self.n_estimators,
            max_samples=self.max_samples,
            contamination=self.contamination,
            max_features=self.max_features,
            bootstrap=self.bootstrap,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose)
        forest.fit(X=X, y=None, sample_weight=None)

        # Flip the score order so that outliers receive the higher scores.
        training_scores = forest.decision_function(X)
        self.decision_scores_ = invert_order(training_scores)
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Return the raw anomaly score of each sample in X.

        Scores come from the fitted detector and are inverted so that, for
        consistency across detectors, outliers get larger values.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        raw_scores = self.detector_.decision_function(X)
        # Flip the score order so that outliers receive the higher scores.
        return invert_order(raw_scores)

    @property
    def estimators_(self):
        """Fitted sub-estimators, forwarded from the scikit-learn forest."""
        return self.detector_.estimators_

    @property
    def estimators_samples_(self):
        """In-bag samples of each base estimator, forwarded from the
        scikit-learn forest.
        """
        return self.detector_.estimators_samples_

    @property
    def max_samples_(self):
        """Actual number of samples, forwarded from the scikit-learn forest."""
        return self.detector_.max_samples_
# Only keep successful or failed projects, drop rows with missing values and
# renumber the index from zero.
states = ['failed', 'successful']
kickstarters = (
    new_dataset[new_dataset.state.isin(states)]
    .dropna()
    .reset_index(drop=True)
)

############ Detecting the anomalies in the dataset ##############
# Fit on the 'goal' column alone; fit() returns the estimator itself.
model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.01,
    max_features=1.0,
    random_state=0,
).fit(kickstarters[['goal']])

# Identify anomalies: raw score and predicted label for every project.
kickstarters['score'] = model.decision_function(kickstarters[['goal']])
kickstarters['anomaly'] = model.predict(kickstarters[['goal']])
kickstarters.head(20)

# Index labels of the observations flagged as anomalies (label -1).
anomaly = kickstarters[kickstarters['anomaly'] == -1]
anomaly_index = anomaly.index.tolist()

# Only keep projects that are not anomalies.
is_flagged = kickstarters.index.isin(anomaly_index)
kickstarters = kickstarters[~is_flagged]

############# Calculated fields ##################################
#1 - length of the name of the project
kickstarters['name_length'] = kickstarters['name'].str.len()
def test_ioforest(stamp):
    """Flag anomalous network-traffic changes in the 300 s before ``stamp``.

    Reads the ``bytes_recv`` / ``bytes_sent`` counters of the ``net``
    measurement from the local ``telegraf`` InfluxDB database, converts
    consecutive readings into scaled differences, fits an isolation forest
    on those differences and reports every difference whose decision score
    is below -0.15. A scatter plot is saved under ``maintenanceapp/static/``.

    Parameters
    ----------
    stamp
        End of the query window in seconds; converted with ``int``.

    Returns
    -------
    A ``jsonify`` response with ``count``, ``time`` (timestamps of the
    flagged readings) and ``filename`` (name of the saved plot), or an
    ``(error response, 416)`` pair when there is not enough data.
    """
    client = InfluxDBClient('localhost', port=8086, database='telegraf')

    window_end = int(stamp)
    window_start = window_end - 300
    query = ('select "bytes_recv" , "bytes_sent" from net where time >= ' +
             str(window_start) + 's and time <= ' + str(window_end) +
             's and bytes_recv != 0')
    result = client.query(query)
    if len(result) == 0:
        return jsonify({
            "error code": 416,
            "error message": "Reading data is error"
        }), 416

    # Turn the cumulative counters into scaled per-sample differences.
    recv_totals = []
    sent_totals = []
    time_store = []
    origin1 = []
    for j, item in enumerate(result.get_points()):
        recv_totals.append(item[u'bytes_recv'])
        sent_totals.append(item[u'bytes_sent'])
        time_store.append(item[u'time'])
        if j != 0:
            test_req = float(recv_totals[j]) - float(recv_totals[j - 1])
            test_sec = float(sent_totals[j]) - float(sent_totals[j - 1])
            origin1.append([test_req / 10240, test_sec / 10240])

    if not origin1:
        # A single reading yields no difference, and the forest cannot be
        # fitted on an empty array; report it like the "no data" case.
        return jsonify({
            "error code": 416,
            "error message": "Reading data is error"
        }), 416

    ceshi = np.array(origin1)
    rng = np.random.RandomState(42)
    clf = IsolationForest(max_samples=300, random_state=rng)
    clf.fit(ceshi)
    anomaly_score = clf.decision_function(ceshi)

    threshold = -0.15
    flagged = [i for i, score in enumerate(anomaly_score) if score < threshold]
    # NOTE(review): difference i spans readings i and i + 1 but is reported
    # with the timestamp of reading i - confirm this is the intended one.
    bad_domains = [time_store[i] for i in flagged]
    count = len(flagged)

    b2 = plt.scatter(ceshi[:, 0], ceshi[:, 1], c='black', s=20, edgecolor='k')
    if flagged:
        out_point = np.array([origin1[i] for i in flagged])
        b1 = plt.scatter(out_point[:, 0], out_point[:, 1], c='red', s=20,
                         edgecolor='k')
    plt.axis('tight')
    plt.xlim((-500, 500))
    plt.ylim((-500, 500))
    if flagged:
        plt.legend([b2, b1], ["test data", "out point"], loc="upper left")

    filename = 'ioforest' + str(stamp) + ".png"
    plt.savefig('maintenanceapp/static/' + filename)
    # Close the figure so that points from this call do not accumulate on the
    # shared pyplot figure and reappear in the plot of the next call.
    plt.close()
    os.system("cd maintenanceapp/static && python test.py")

    return jsonify({
        'count': count,
        'time': bad_domains,
        'filename': filename
    })
class IsoForest(object):
    """Isolation forest bundled with its dataset, timers and diagnostics."""

    def __init__(self, dataset, n_estimators=100, max_samples='auto',
                 contamination=0.1, **kwargs):
        """Load ``dataset``, build the forest and reset timers/diagnostics.

        Extra keyword arguments are forwarded to ``initialize_isoForest``.
        """
        # load dataset
        load_dataset(self, dataset)

        # forest hyper-parameters and the forest itself
        self.isoForest = None
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.contamination = contamination
        self.initialize_isoForest(seed=self.data.seed, **kwargs)

        # train and test time
        self.clock = 0
        self.clocked = 0
        self.train_time = 0
        self.test_time = 0

        # Scores, AUC and accuracy for each data split
        labels_by_split = (
            ('train', self.data._y_train),
            ('val', self.data._y_val),
            ('test', self.data._y_test),
        )
        self.diag = {}
        for split, labels in labels_by_split:
            self.diag[split] = {
                'scores': np.zeros((len(labels), 1)),
                'auc': np.zeros(1),
                'acc': np.zeros(1),
            }

        # AD results log
        self.ad_log = AD_Log()

        # diagnostics
        self.best_weight_dict = None  # attribute to reuse nnet plot-functions

    def initialize_isoForest(self, seed=0, **kwargs):
        """Create the IsolationForest from the stored hyper-parameters."""
        forest_options = dict(n_estimators=self.n_estimators,
                              max_samples=self.max_samples,
                              contamination=self.contamination,
                              n_jobs=-1,
                              random_state=seed)
        self.isoForest = IsolationForest(**forest_options, **kwargs)

    def load_data(self, data_loader=None, pretrain=False):
        """Replace ``self.data`` with the object returned by ``data_loader``."""
        self.data = data_loader()

    def start_clock(self):
        """Remember the current time as the start of a measurement."""
        self.clock = time.time()

    def stop_clock(self):
        """Store and print the time elapsed since ``start_clock``."""
        self.clocked = time.time() - self.clock
        print("Total elapsed time: %g" % self.clocked)

    def train(self):
        """Fit the forest on the training inputs and record the fit time."""
        X_train = self.data._X_train
        if X_train.ndim > 2:
            # flatten everything but the first axis
            X_train = X_train.reshape(X_train.shape[0], -1)

        print("Starting training...")
        self.start_clock()
        self.isoForest.fit(X_train.astype(np.float32))
        self.stop_clock()
        self.train_time = self.clocked

    def predict(self, which_set='train'):
        """Score one split ('train' or 'test') and update ``self.diag``."""
        assert which_set in ('train', 'test')

        if which_set == 'train':
            X, y = self.data._X_train, self.data._y_train
        elif which_set == 'test':
            X, y = self.data._X_test, self.data._y_test

        # reshape to 2D if input is tensor
        if X.ndim > 2:
            X = X.reshape(X.shape[0], -1)

        print("Starting prediction...")
        self.start_clock()

        # anomaly score: negated decision function
        raw_decision = self.isoForest.decision_function(X.astype(np.float32))
        scores = (-1.0) * raw_decision
        # prediction: 1 where the forest predicts -1, otherwise 0
        forest_labels = self.isoForest.predict(X.astype(np.float32))
        y_pred = (forest_labels == -1) * 1

        split_diag = self.diag[which_set]
        split_diag['scores'][:, 0] = scores.flatten()
        split_diag['acc'][0] = 100.0 * sum(y == y_pred) / len(y)

        # the AUC is only computed when at least one positive label exists
        if sum(y) > 0:
            auc = roc_auc_score(y, scores.flatten())
            split_diag['auc'][0] = auc

        self.stop_clock()
        if which_set == 'test':
            self.test_time = self.clocked

    def dump_model(self, filename=None):
        """Save the model through ``dump_isoForest``."""
        dump_isoForest(self, filename)

    def load_model(self, filename=None):
        """Load the model from an existing file through ``load_isoForest``."""
        assert filename and os.path.exists(filename)
        load_isoForest(self, filename)

    def log_results(self, filename=None):
        """
        log the results relevant for anomaly detection
        """
        for split, elapsed in (('train', self.train_time),
                               ('test', self.test_time)):
            self.ad_log[split + '_auc'] = self.diag[split]['auc'][-1]
            self.ad_log[split + '_accuracy'] = self.diag[split]['acc'][-1]
            self.ad_log[split + '_time'] = elapsed

        self.ad_log.save_to_file(filename=filename)
def _random_points(age_range, salary_range, count):
    """Return a (count, 2) integer array of random (age, salary) rows."""
    ages = np.random.randint(age_range[0], age_range[1], [count, 1])
    salaries = np.random.randint(salary_range[0], salary_range[1], [count, 1])
    return np.concatenate((ages, salaries), axis=1)


def _flagged_rows(model, points):
    """Return the rows of *points* that *model* predicts as outliers (label < 0)."""
    # Boolean-mask selection replaces growing an array one row at a time;
    # the float cast keeps the dtype the row-stacking version produced.
    return points[model.predict(points) < 0].astype(float)


def main():
    """Fit an IsolationForest on synthetic (age, salary) data and plot what it flags.

    Four random data sets are generated: a training set, a test set drawn from
    the same ranges, and two sets drawn from ranges outside the training
    ranges.  The forest is fitted on the training set only; the rows of each
    set that it predicts as outliers are scattered over a contour plot of the
    decision function.
    """
    train = _random_points((18, 60), (30, 90), 1000)
    test = _random_points((18, 60), (30, 90), 100)
    outliers = _random_points((1, 10), (10, 20), 100)
    outliers1 = _random_points((61, 100), (100, 200), 100)

    clf = IsolationForest(max_samples=100, contamination=0.01)
    clf.fit(train)

    # (data set, scatter colour) in the order they are drawn
    flagged = [
        (_flagged_rows(clf, train), 'red'),
        (_flagged_rows(clf, test), 'green'),
        (_flagged_rows(clf, outliers), 'blue'),
        (_flagged_rows(clf, outliers1), 'black'),
    ]

    # decision function evaluated on a grid covering all four data sets
    xx, yy = np.meshgrid(np.linspace(1, 100, 50), np.linspace(1, 200, 50))
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.title("IsolationForest")
    plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
    for rows, colour in flagged:
        plt.scatter(rows[:, 0], rows[:, 1], c=colour)
    plt.show()
if i in [1, 2, 3, 8, 9, 10]: labels_val[index] = 1 # fit the model param = {} for max_features_ in range(1, 12): for contamination_ in np.arange(0, 0.2, 0.01): clf = IsolationForest(n_estimators=300, contamination=contamination_, max_samples=32, bootstrap=False, max_features=max_features_) clf.fit(X_train) val_score = clf.decision_function(X_val) x, y, threshold = roc_curve(labels_val, -val_score) a = auc(x, y) print('max_features:%s contamination:%s auc:%s' % (max_features_, contamination_, a)) param[a] = (max_features_, contamination_) best_model = list(param.keys())[np.argmax(list(param.keys()))] best_param = param[best_model] print('best:auc:%s contamination:%s max_features:%s' % (best_model, best_param[1], best_param[0])) clf = IsolationForest(n_estimators=300, contamination=best_param[1], max_samples=256,
def _min_max_scale(values):
    """Return *values* as an array rescaled linearly so its range is [0, 1]."""
    values = np.array(values)
    lowest = values.min()
    return (values - lowest) / (values.max() - lowest)


def _save_score_heat_map(fig_num, points, scores, title, path):
    """Scatter the 2-D *points* coloured by *scores* in figure *fig_num* and save it."""
    plt.figure(fig_num)
    plt.scatter(points[:, 0], points[:, 1], marker='o', c=scores,
                cmap='viridis')
    plt.colorbar()
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.title(title)
    plt.savefig(path)


def computeDiff_RF(ntrees=1024, sample_size_ratio=.33, alpha0=.1):
    """Compare DiFF_RF and Isolation Forest on the pickled "donnuts" data set.

    Loads the data from ``PKL/donnutsDataProblem.pkl``, saves cluster plots and
    score heat maps under ``FIG/``, shows the figures, and prints the ROC AUC
    of each score.

    :param ntrees: number of trees for both DiFF_RF and the Isolation Forest.
    :param sample_size_ratio: used as the sample size itself when greater
        than 1, otherwise as a fraction of the number of rows in the normal
        training data.
    :param alpha0: passed as ``alpha`` to ``DiFF_TreeEnsemble.anomaly_score``.
    """
    # load data
    # NOTE(review): pickle.load can execute arbitrary code -- only use this on
    # a trusted file.
    with open('PKL/donnutsDataProblem.pkl', 'rb') as f:
        Xn, Xnt, Xa, Xb, Xab = pickle.load(f)

    if sample_size_ratio > 1:
        sample_size = sample_size_ratio
    else:
        sample_size = int(sample_size_ratio * len(Xn))

    xn, yn = Xn[:, 0], Xn[:, 1]
    xa, ya = Xa[:, 0], Xa[:, 1]
    xb, yb = Xb[:, 0], Xb[:, 1]

    pathlib.Path('./FIG').mkdir(parents=True, exist_ok=True)

    # plotting the donnuts data
    plt.figure(1)
    plt.plot(xn, yn, 'bo', markersize=10)
    plt.savefig('FIG/clustersDonnuts0.pdf')

    nn = len(Xa)
    plt.figure(2)
    plt.plot(xn, yn, 'bo', xa[0:nn], ya[0:nn], 'rs')
    plt.savefig('FIG/clustersDonnuts1.pdf')

    plt.figure(3)
    plt.plot(xn, yn, 'bo', xa[0:nn], ya[0:nn], 'rs', xb[0:nn], yb[0:nn], 'gd')
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.savefig('FIG/clustersDonnuts2.pdf')

    # Creating Forest on normal data + anomalies labels
    print('building the Diff_RF ...')
    diff_rf = DiFF_TreeEnsemble(sample_size=sample_size, n_trees=ntrees)

    fit_start = time.time()
    diff_rf.fit(Xn, n_jobs=8)
    fit_time = time.time() - fit_start
    print(f"fit time {fit_time:3.2f}s")
    n_nodes = sum(t.n_nodes for t in diff_rf.trees)
    print(f"{n_nodes} total nodes in {ntrees} trees")

    # evaluation set: the first len(Xnt) rows come from Xnt, the rest from Xab
    XT = np.concatenate([Xnt, Xab])
    sc_di, sc_ff, sc_diff_rf = diff_rf.anomaly_score(XT, alpha=alpha0)
    sc_ff = _min_max_scale(sc_ff)
    sc_di = _min_max_scale(sc_di)
    sc_diff_rf = _min_max_scale(sc_diff_rf)

    _save_score_heat_map(1000, XT, sc_ff,
                         'DiFF_RF (visiting frequency score) Heat Map',
                         'FIG/HeatMap_DiFF_RF_freqScore.pdf')
    _save_score_heat_map(1001, XT, sc_diff_rf,
                         'DiFF_RF (collective anomaly score) Heat Map',
                         'FIG/HeatMap_DiFF_RF_collectiveScore.pdf')
    _save_score_heat_map(1002, XT, sc_di,
                         'DiFF_RF (point-wise anomaly score) Heat Map',
                         'FIG/HeatMap_DiFF_RF_pointWiseScore.pdf')

    cif = IsolationForest(n_estimators=ntrees, max_samples=sample_size,
                          bootstrap=False, n_jobs=12)
    cif.fit(Xn)
    # sign flipped so that, like the DiFF_RF scores, higher means more anomalous
    sc_if = _min_max_scale(-cif.decision_function(XT))
    _save_score_heat_map(1003, XT, sc_if, 'Isolation Forest Heat Map',
                         'FIG/HeatMap_IF.pdf')
    plt.show()

    # rows from Xnt are labelled -1, rows from Xab are labelled 1
    y_true = np.array([-1] * len(Xnt) + [1] * len(Xab))
    fpr_IF, tpr_IF, _ = roc_curve(y_true, sc_if)
    aucIF = auc(fpr_IF, tpr_IF)
    fpr_D, tpr_D, _ = roc_curve(y_true, sc_di)
    aucD = auc(fpr_D, tpr_D)
    fpr_F, tpr_F, _ = roc_curve(y_true, sc_ff)
    aucF = auc(fpr_F, tpr_F)
    fpr_DF, tpr_DF, _ = roc_curve(y_true, sc_diff_rf)
    aucDF = auc(fpr_DF, tpr_DF)

    print("Isolation Forest AUC=", aucIF)
    print("DiFF_RF (point-wise anomaly score) AUC=", aucD)
    print("DiFF_RF (frequency of visit scoring only) AUC=", aucF)
    print("DiFF_RF (collective anomaly score) AUC=", aucDF)
# Build a training set of two Gaussian blobs plus uniform noise, fit an
# IsolationForest on it, and shade the part of the plane whose decision
# value lies below the outlier-fraction percentile of the training scores.
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
X = 0.3 * rng.randn(n_inliers // 2, 2)
X_train = np.r_[X + 2, X - 2]  # inlier (normal) samples
# NOTE(review): the noise below is drawn from the global np.random state,
# not from rng -- confirm this is intended.
X_train = np.r_[X_train, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]  # inliers plus outlier samples

# fit the model
clf = IsolationForest(max_samples=n_samples, random_state=rng, contamination=outliers_fraction)
clf.fit(X_train)
# y_pred_train = clf.predict(X_train)
scores_pred = clf.decision_function(X_train)
threshold = stats.scoreatpercentile(
    scores_pred, 100 * outliers_fraction)  # threshold derived from the fraction of outliers in the training samples; used for plotting

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-7, 7, 50), np.linspace(-7, 7, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.title("IsolationForest")
# plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)  # draw the outlier region: the values from the minimum up to the threshold
lof = LocalOutlierFactor(n_neighbors=20) ocsvm = OneClassSVM() lim_inf = X.min(axis=0) lim_sup = X.max(axis=0) volume_support = (lim_sup - lim_inf).prod() t = np.arange(0, 100 / volume_support, 0.01 / volume_support) axis_alpha = np.arange(alpha_min, alpha_max, 0.0001) unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features)) # fit: print('IsolationForest processing...') iforest = IsolationForest() iforest.fit(X_train) s_X_iforest = iforest.decision_function(X_test) print('LocalOutlierFactor processing...') lof = LocalOutlierFactor(n_neighbors=20) lof.fit(X_train) s_X_lof = lof.decision_function(X_test) print('OneClassSVM processing...') ocsvm = OneClassSVM() ocsvm.fit(X_train[:min(ocsvm_max_train, n_samples_train - 1)]) s_X_ocsvm = ocsvm.decision_function(X_test).reshape(1, -1)[0] s_unif_iforest = iforest.decision_function(unif) s_unif_lof = lof.decision_function(unif) s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0] plt.subplot(121) auc_iforest, em_iforest, amax_iforest = em(t, t_max, volume_support, s_unif_iforest,
def lookout(args):
    """Pick the feature-pair plots that best explain the outliers and save them.

    An Isolation Forest is fitted on every pair of features; each outlier gets
    one score per pair.  Up to ``args.budget`` pairs are then chosen greedily
    by marginal gain, and one scatter plot per chosen pair is written to
    ``args.output_dir``.

    :param args: namespace providing ``budget`` (number of plots to choose),
        ``dataset`` (0 selects ``HTRU_2.csv``, anything else
        ``HTRU_2_filtered.csv``) and ``output_dir``; it is also forwarded to
        ``get_marginal_gain``.
    """
    # Number of plots to choose
    budget = args.budget
    elapsed = -timer()

    # Load dataset
    if args.dataset == 0:
        full_df = pd.read_csv("HTRU_2.csv")
    else:
        full_df = pd.read_csv("HTRU_2_filtered.csv")

    # Isolate outliers and inliers, and remove the target column so it does
    # not get mixed in as a feature.  Non-inplace drops avoid mutating slices
    # of full_df.
    inlier_df = full_df.loc[full_df['Class'] == 0].drop(columns=['Class'])
    outlier_df = (full_df.loc[full_df['Class'] == 1]
                  .reset_index(drop=True)
                  .drop(columns=['Class']))
    full_df = full_df.drop(columns=['Class'])

    # Get all available features and combine them 2 by 2
    all_features = list(full_df.columns)
    feature_pairs = list(ncr(all_features, 2))

    # Matrix with scores for all outliers on all feature-pair plots
    # (row = plot, column = outlier); rows are collected in a list and stacked
    # once instead of re-allocating the matrix for every pair.
    classifier = IF()
    score_rows = []
    for feature_pair in feature_pairs:
        columns = list(feature_pair)
        # Model for current feature pair
        classifier.fit(full_df[columns])
        score_rows.append(classifier.decision_function(outlier_df[columns]))

    # In Isolation Forest, negative scores are considered outliers and positive
    # scores inliers. Original range is [-0.5, 0.5].
    # To ensure greedy approximation optimality we must ensure non negative
    # range, i.e, convert scores to [0,1]: flip the sign (so outliers actually
    # have better scores) and add 0.5.
    scores = 0.5 - np.array(score_rows)

    # Plot selection using greedy heuristic approach (see paper for proof of
    # near optimality)
    S = []  # Final plot selection
    while budget > 0:
        # Only pairs that have not been selected already; iterating the list
        # (rather than a set difference) keeps tie-breaking deterministic.
        candidate_pairs = [pair for pair in feature_pairs if pair not in S]
        if not candidate_pairs:
            # budget exceeds the number of available plots
            break
        marginal_gains = [
            get_marginal_gain(S, pair, feature_pairs, scores, args)
            for pair in candidate_pairs
        ]
        # Get max marginal gain and retrieve the respective feature pair
        S.append(candidate_pairs[marginal_gains.index(max(marginal_gains))])
        budget -= 1

    elapsed += timer()
    print("Final selection: {}".format(S))
    print("Execution time: {0:.2f}s".format(elapsed))
    print("Incrimination: {}".format(
        get_incrimination(S, feature_pairs, scores)))

    # Tuple of (best_outliers, other_outliers) for each feature pair; these are
    # positional ids into outlier_df.
    outlier_points = []
    n_outliers = outlier_df.shape[0]
    # Best score of every outlier over all plots (same for every selected plot)
    outliers_max_plot_scores = np.max(scores, axis=0) if S else None
    for feature_pair in S:
        feature_pair_row_idx = get_row_indices([feature_pair],
                                               feature_pairs)[0]
        # Outliers whose score on this plot is (numerically) their best score
        is_best = np.isclose(scores[feature_pair_row_idx],
                             outliers_max_plot_scores)
        best_outliers_ids = np.flatnonzero(is_best).tolist()
        best_lookup = set(best_outliers_ids)
        remaining_outliers_ids = [
            idx for idx in range(n_outliers) if idx not in best_lookup
        ]
        outlier_points.append((best_outliers_ids, remaining_outliers_ids))

    # Plotting the chosen features
    for feature_pair, (best_ids, other_ids) in zip(S, outlier_points):
        columns = list(feature_pair)
        # Inliers, outliers best explained by this pair, and other outliers;
        # .assign returns new frames, so no slice of the source data is mutated.
        inliers = inlier_df[columns].assign(
            **{'class': 'inlier', 'point_size': 25})
        best_outliers = outlier_df.iloc[best_ids][columns].assign(
            **{'class': 'best', 'point_size': 35})
        other_outliers = outlier_df.iloc[other_ids][columns].assign(
            **{'class': 'other', 'point_size': 25})

        # Joining all the dataframes (DataFrame.append no longer exists in
        # pandas 2.x)
        plot_df = pd.concat([inliers, best_outliers, other_outliers])

        # Actual Plotting
        f, ax = plt.subplots(figsize=(6.5, 6.5))
        sns.scatterplot(x=feature_pair[0], y=feature_pair[1], hue="class",
                        size="point_size", palette=get_palette(plot_df),
                        linewidth=1, legend='full', alpha=0.7,
                        edgecolor='black', data=plot_df, ax=ax)
        plt.autoscale(True)

        # Saving plots
        os.makedirs(args.output_dir, exist_ok=True)
        plt.savefig('%s/%s_%s.png' % (args.output_dir, *feature_pair))
# Split off the evaluation part of the data, fit an IsolationForest on
# X_train, and accumulate timing and ROC / precision-recall results.
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

# # training only on normal data:
# X_train = X_train[y_train == 0]
# y_train = y_train[y_train == 0]

print('IsolationForest processing...')
model = IsolationForest()
tstart = time()
model.fit(X_train)
fit_time += time() - tstart  # fit_time accumulates across repetitions of this code
tstart = time()

scoring = -model.decision_function(X_test)  # the lower,the more normal
predict_time += time() - tstart
fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

# abort once the accumulated fit + predict time exceeds the allowed budget
if predict_time + fit_time > max_time:
    raise TimeoutError

# interpolate this run's ROC curve onto the shared x_axis grid and add it to
# the running total in tpr
f = interp1d(fpr_, tpr_)
tpr += f(x_axis)
tpr[0] = 0.

precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]

# cluster: old version of scipy -> interpol1d needs sorted x_input
arg_sorted = recall_.argsort()
recall_ = recall_[arg_sorted]
import numpy as np
import matplotlib.pyplot as plt
#get_ipython().magic(u'matplotlib inline')

# ### 1D

# In[2]:

# Fit an Isolation Forest on a 1-D mixture of two unit-variance normals
# (centred at 0 and 5) and plot its decision function over the data histogram.
isolation_forest = IsolationForest()
data = np.concatenate(
    (np.random.normal(size=100), np.random.normal(loc=5., size=100)))
isolation_forest.fit(data.reshape(-1, 1))
xx = np.linspace(-4, 10, 1000)
plt.plot(xx, isolation_forest.decision_function(xx.reshape(-1, 1)))
# `density=True` replaces the `normed=True` keyword, which Matplotlib no
# longer accepts.
plt.hist(data, density=True)

# ### 2D

# In[67]:

X = np.random.randn(8000, 2)

# In[68]:

# Small forest (15 trees) fitted on 2-D standard-normal samples
isolation_forest = IsolationForest(n_estimators=15)
isolation_forest.fit(X)

# In[70]:
def _plot_series_with_windows(pl, series, window_starts, window_ids,
                              window_size):
    """Draw *series* in blue and overlay the selected windows in red."""
    pl.set_xlim([0, series.shape[0]])
    pl.plot(np.arange(0, series.shape[0]), series, 'b-', linewidth=0.5)
    for i in window_ids:
        start = window_starts[i]
        # only draw windows that lie completely inside the series
        if start + window_size <= len(series):
            pl.plot(np.arange(start, start + window_size),
                    series[start:(start + window_size)], 'r-')


def find_anomalies_with_shingles(dataset, data, window_size=5, skip_size=None,
                                 ad_type="ifor", normalize_trend=False,
                                 n_top=10, outliers_fraction=0.1,
                                 log_transform=False):
    """ Finds anomalous regions in time series using standard unsupervised detectors

    First the time series is chopped up into windows ('shingles').
    Then, a standard anomaly detector is run, and the ``n_top`` highest-scoring
    windows are highlighted in a PDF written under ``temp/timeseries/``.

    :param dataset: name used only to build the output file name.
    :param data: the time series values.
    :param window_size: length of each shingle.
    :param skip_size: forwarded to ``TSeries.get_shingles``.
    :param ad_type: detector to use: "ocsvm", "ifor", "lof" or "autoenc".
    :param normalize_trend: if True, the series is differenced first and the
        original series is plotted in a second panel.
    :param n_top: number of top-scoring windows to highlight.
    :param outliers_fraction: used as ``nu`` (ocsvm) or ``contamination``
        (ifor, lof).
    :param log_transform: if True, log-transform the series before anything else.
    :raises ValueError: if ``ad_type`` is not supported or no shingles are
        produced.
    """
    x = w = None
    n = 0

    ts_data = data
    if log_transform:
        # log-transform now since the values are positive (in context of
        # many real-world datasets line airline); otherwise, values become
        # negative after de-trending
        ts_data = log_transform_series(ts_data, eps=1.0)

    if normalize_trend:
        # remove trend from series
        ts_data = difference_series(ts_data)

    ts = TSeries(ts_data, y=None)
    for x_, _, w in ts.get_shingles(window_size, skip_size=skip_size,
                                    batch_size=-1):
        # flatten every shingle into one feature row (shape passed
        # positionally: the `newshape` keyword of np.reshape is deprecated)
        x = np.reshape(x_, (x_.shape[0], -1))
        n = x.shape[0]
        logger.debug("Total instances: %d" % n)
        # logger.debug("Windows:\n%s" % str(w))

    if x is None:
        raise ValueError("time series produced no shingles")

    if ad_type == "ocsvm":
        ad = svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)
        ad.fit(x)
        scores = -ad.decision_function(x).reshape((n, ))
    elif ad_type == "ifor":
        ad = IsolationForest(max_samples=min(256, x.shape[0]),
                             contamination=outliers_fraction,
                             random_state=None)
        ad.fit(x)
        scores = -ad.decision_function(x)
    elif ad_type == "lof":
        ad = LocalOutlierFactor(n_neighbors=35,
                                contamination=outliers_fraction)
        ad.fit(x)
        # NOTE(review): _decision_function is a private scikit-learn method --
        # confirm it exists in the installed version.
        scores = -ad._decision_function(x)
    elif ad_type == "autoenc":
        n_hiddens = max(1, window_size // 2)
        ad = AutoencoderAnomalyDetector(
            n_inputs=x.shape[1],
            n_neurons=[300, n_hiddens, 300],
            normalize_scale=True,
            activations=[tf.nn.tanh, tf.nn.tanh, tf.nn.tanh, None])
        ad.fit(x)
        scores = -ad.decision_function(x)
    else:
        # previously this fell through and failed later with a TypeError
        raise ValueError("unsupported ad_type: %r" % (ad_type,))

    # signs were flipped above, so the largest scores are the most anomalous
    top_anoms = np.argsort(-scores)[0:n_top]
    logger.debug("top scores (%s):\n%s\n%s" %
                 (ad_type, str(top_anoms), str(scores[top_anoms])))

    pdfpath = "temp/timeseries/timeseries_shingles_%s_w%d%s_%s.pdf" % \
              (dataset, window_size, "" if not log_transform else "_log", ad_type)
    dp = DataPlotter(pdfpath=pdfpath, rows=2, cols=1)
    try:
        # plot the timeseries anomalies with the detrended series
        _plot_series_with_windows(dp.get_next_plot(), ts.samples, w,
                                  top_anoms, window_size)
        if normalize_trend:
            # plot the original series with anomalous windows
            _plot_series_with_windows(dp.get_next_plot(), data, w,
                                      top_anoms, window_size)
    finally:
        # always finalize the plotter, even if plotting fails
        dp.close()
if_auc_std = np.zeros(shape=(n_iterations, ), dtype=np.float32) sf_auc_std = np.zeros(shape=(n_iterations, ), dtype=np.float32) if_precison_std = np.zeros(shape=(n_iterations, ), dtype=np.float32) sf_precision_std = np.zeros(shape=(n_iterations, ), dtype=np.float32) if_recall_std = np.zeros(shape=(n_iterations, ), dtype=np.float32) sf_recall_std = np.zeros(shape=(n_iterations, ), dtype=np.float32) if_f1_std = np.zeros(shape=(n_iterations, ), dtype=np.float32) sf_f1_std = np.zeros(shape=(n_iterations, ), dtype=np.float32) # run for i in range(n_iterations): print(f'{dataset}, {i+1} / {n_iterations}') IF = IsolationForest() IF.fit(X_train, y_train) if_pred = IF.decision_function(X_test) if_auc[i] = roc_auc_score(y_test, if_pred) if_class_pred = np.ones_like(if_pred) if_class_pred[if_pred <= 0.0] = -1 if_precison[i] = precision_score(y_test, if_class_pred) if_recall[i] = recall_score(y_test, if_class_pred) if_f1[i] = f1_score(y_test, if_class_pred) sf = IsolationSimilarityForest(**params) sf.fit(X_train, y_train) sf_pred = sf.decision_function(X_test) sf_auc[i] = roc_auc_score(y_test, sf_pred) sf_class_pred = np.ones_like(sf_pred) sf_class_pred[sf_pred <= 0.0] = -1