def perform_outlier_detection(self, X, len_priors): # LOF on all features scores = dict() for key, value in X.iteritems(): if key == 'user': clf = IsolationForest() clf.fit(value) scores[key] = clf.decision_function(value) else: clf = LocalOutlierFactor(n_neighbors=20) clf.fit(value) check_is_fitted(clf, ["threshold_", "negative_outlier_factor_", "n_neighbors_", "_distances_fit_X_"]) if value is not None: value = check_array(value, accept_sparse='csr') scores[key] = clf._decision_function(value) else: scores[key] = clf.negative_outlier_factor_ with open('clique_expansion/' + self.seed_user + '_unnormalized_scores.csv', 'w') as f: for domain, all_scores in scores.iteritems(): for item in all_scores: f.write(str(item) + ',') f.write('\n') combined_scores = self.combine(scores) scores = None new_scores = combined_scores[len_priors:] user_scores = sorted(zip(self.current_level_users, new_scores), key=lambda x: x[1], reverse=True) threshold = np.percentile(new_scores, 95) outliers = [u[0] for u in user_scores if u[1] >= threshold] return outliers
def perform_outlier_detection_all_combos(self, X, len_priors):
    """Score each non-user domain with two detectors and return the lowest-scoring new users.

    Every domain of ``X`` except ``'user'`` is scored twice: once with an
    ``IsolationForest`` and once with a ``LocalOutlierFactor``
    (20 neighbours).  The raw scores are dumped to
    ``clique_expansion/<seed_user>_unnormalized_scores.csv`` (one labelled
    row per domain/detector pair) and merged by ``self.combine_all``.

    Args:
        X: mapping-like object whose ``iteritems()`` yields
            ``(domain name, feature matrix)`` pairs; domain names must be
            among ``'temporal'``, ``'content'``, ``'user'``, ``'network'``.
        len_priors: number of leading combined scores to drop before
            ranking (presumably the rows of previously known users --
            verify against the caller).

    Returns:
        list: members of ``self.current_level_users`` whose combined score
        is at or below the 8th percentile of the remaining scores, ordered
        by descending score.  Empty when no scores remain.
    """
    # LOF on all features
    scores = {'temporal': {}, 'content': {}, 'user': {}, 'network': {}}
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print("Starting anomaly detection loop")
    for key, value in X.iteritems():
        if key == 'user':
            continue
        print(key)

        clf = IsolationForest()
        clf.fit(value)
        scores[key]['iforest'] = clf.decision_function(value)
        print("Finished iforest")

        clf = LocalOutlierFactor(n_neighbors=20)
        clf.fit(value)
        scores[key]['lof'] = clf._decision_function(value)
    print("Finished anomaly detection loop")

    # Each row: "<domain> <detector>," followed by comma-terminated values.
    with open('clique_expansion/' + self.seed_user + '_unnormalized_scores.csv', 'w') as f:
        for domain, value in scores.iteritems():
            for type_score, all_scores in value.iteritems():
                f.write(domain + ' ' + type_score + ',')
                f.write(''.join(str(item) + ',' for item in all_scores))
                f.write('\n')

    combined_scores = self.combine_all(scores)
    scores = None  # release the per-domain arrays before ranking
    new_scores = combined_scores[len_priors:]

    # np.percentile is not usable on an empty sequence (it errors or yields
    # NaN depending on the NumPy release); nothing to rank means no outliers.
    if len(new_scores) == 0:
        return []

    user_scores = sorted(zip(self.current_level_users, new_scores),
                         key=lambda pair: pair[1], reverse=True)
    threshold = np.percentile(new_scores, 8)
    return [user for user, score in user_scores if score <= threshold]
def find_anomalies_with_shingles(ts, window_size=5, skip_size=None, ad_type="ifor",
                                 n_top=10, outliers_fraction=0.1):
    """Find anomalous regions in a time series using standard unsupervised detectors.

    First the time series is chopped up into windows ('shingles') and each
    shingle is flattened into one feature row.  Then the detector selected
    by ``ad_type`` is fit on all rows, and the ``n_top`` highest-scoring
    windows are drawn in red over the series in
    ``temp/timeseries/timeseries_shingles_w<window_size>_<ad_type>.pdf``.

    Args:
        ts: time-series object providing ``get_shingles(...)`` and ``samples``.
        window_size: shingle length, forwarded to ``ts.get_shingles``.
        skip_size: forwarded to ``ts.get_shingles``.
        ad_type: one of ``"ocsvm"``, ``"ifor"``, ``"lof"``, ``"autoenc"``.
        n_top: number of top-scoring windows to highlight.
        outliers_fraction: used as ``nu`` for the one-class SVM and as
            ``contamination`` for isolation forest and LOF.

    Raises:
        ValueError: if ``ad_type`` is not supported, or if the series
            yields no shingles.
    """
    supported = ("ocsvm", "ifor", "lof", "autoenc")
    # Fail fast: an unknown type used to surface only after shingling, as an
    # opaque TypeError when negating ``scores = None``.
    if ad_type not in supported:
        raise ValueError("Unsupported ad_type %r; expected one of: %s"
                         % (ad_type, ", ".join(supported)))

    x = w = None
    n = 0
    for x_, _, w in ts.get_shingles(window_size, skip_size=skip_size, batch_size=-1):
        # flatten every shingle into a single feature row
        x = np.reshape(x_, newshape=(x_.shape[0], -1))
        n = x.shape[0]
        logger.debug("Total instances: %d", n)

    if x is None:
        raise ValueError("time series produced no shingles for window_size=%r" % (window_size,))

    # All detector outputs are negated so that larger values rank first.
    if ad_type == "ocsvm":
        ad = svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)
        ad.fit(x)
        scores = -ad.decision_function(x).reshape((n, ))
    elif ad_type == "ifor":
        # clamp max_samples to the number of rows, as the dataset-based
        # variant of this routine does
        ad = IsolationForest(max_samples=min(256, x.shape[0]),
                             contamination=outliers_fraction, random_state=None)
        ad.fit(x)
        scores = -ad.decision_function(x)
    elif ad_type == "lof":
        ad = LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)
        ad.fit(x)
        scores = -ad._decision_function(x)
    else:  # "autoenc"
        n_hiddens = max(1, window_size // 2)
        ad = AutoencoderAnomalyDetector(
            n_inputs=x.shape[1],
            n_neurons=[300, n_hiddens, 300],
            normalize_scale=True,
            activations=[tf.nn.tanh, tf.nn.tanh, tf.nn.tanh, None])
        ad.fit(x)
        scores = -ad.decision_function(x)

    top_anoms = np.argsort(-scores)[0:n_top]
    logger.debug("top scores (%s):\n%s\n%s", ad_type, top_anoms, scores[top_anoms])

    pdfpath = "temp/timeseries/timeseries_shingles_w%d_%s.pdf" % (window_size, ad_type)
    dp = DataPlotter(pdfpath=pdfpath, rows=3, cols=1)
    try:
        pl = dp.get_next_plot()
        pl.set_xlim([0, ts.samples.shape[0]])
        pl.plot(np.arange(0, ts.samples.shape[0]), ts.samples, 'b-', linewidth=0.5)
        for i in top_anoms:
            start = w[i]
            stop = start + window_size
            # skip windows that would run past the end of the series
            if stop <= len(ts.samples):
                pl.plot(np.arange(start, stop), ts.samples[start:stop], 'r-')
    finally:
        # always release the plotter, even if drawing fails
        dp.close()
# Flatten the mesh into a list of 2-D points so the detector can score it.
x_grid = np.c_[xx.ravel(), yy.ravel()]

# Fit the selected detector on x, then score both the samples and the grid.
# Every score is negated so that larger values rank first below.
if ad_type == "ocsvm":
    ad = svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)
    ad.fit(x)
    scores = -ad.decision_function(x).reshape((n,))
    Z = -ad.decision_function(x_grid)
elif ad_type == "ifor":
    ad = IsolationForest(max_samples=256, contamination=outliers_fraction, random_state=None)
    ad.fit(x)
    scores = -ad.decision_function(x)
    Z = -ad.decision_function(x_grid)
elif ad_type == "lof":
    ad = LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)
    ad.fit(x)
    scores = -ad._decision_function(x)
    Z = -ad._decision_function(x_grid)
elif ad_type == "loda":
    ad = Loda(mink=100, maxk=200)
    ad.fit(x)
    scores = -ad.decision_function(x)
    Z = -ad.decision_function(x_grid)

logger.debug("scores:\n%s" % str(list(scores)))

# Ten highest-scoring samples.  A slice (rather than indexing with
# np.arange(10)) keeps this working when fewer than ten samples exist.
top_anoms = np.argsort(-scores)[:10]

if args.plot:
    # plot_samples_and_lines(x, lines=None, line_colors=None, line_legends=None,
    #                        top_anoms=top_anoms, pdfpath="temp/%s_%soutlier.pdf" % (ad_type, sample_type))
    # reshape the grid scores back onto the mesh
    Z = Z.reshape(xx.shape)
    pdfpath = "temp/ad_%scontours_%s.pdf" % (sample_type, ad_type)
# Fit the selected outlier model on X and score the rows of Z.
if clustering_model_type == 'SVM':
    model = svm.OneClassSVM(nu=model_param[0], kernel="rbf", gamma="auto")
    model.fit(X)
    # NOTE(review): decision_function appears to return (n_samples, 1) on
    # some scikit-learn releases and (n_samples,) on others -- confirm.
    # ravel() yields one scalar per row either way, whereas indexing s[0]
    # fails on a 1-D result.
    score = list(model.decision_function(Z).ravel())
elif clustering_model_type == 'IF':
    model = IsolationForest(max_samples=n_samples, contamination=model_param[0], random_state=rng)
    model.fit(X)
    score = model.decision_function(Z)
elif clustering_model_type == 'LOF':
    model = LocalOutlierFactor(n_neighbors=model_param[1], contamination=model_param[0])
    # fit_predict is used only to fit; the returned labels are discarded
    model.fit_predict(X)
    score = model._decision_function(Z)

# Save Z: pair every index label of Z with its score, highest score first.
Z_with_word = pd.DataFrame(list(zip(Z.index, score)))
Z_with_word = Z_with_word.sort_values(by=1, ascending=False)

# <dist>/clustering-<50|200>-<count_threshold>-<a_type>-<model>-<param0>[-<param1>]
name_fields = [
    '50' if w2v_param == 0 else '200',
    str(count_threshold),
    a_type,
    clustering_model_type,
    str(model_param[0]),
]
name = dist + '/clustering' + '-' + '-'.join(name_fields)
if len(model_param) != 1:
    name += '-' + str(model_param[1])

print('save file. name is "' + name + '"')
Z_with_word.to_csv(name, header=None, index=False)
def find_anomalies_with_shingles(dataset, data, window_size=5, skip_size=None, ad_type="ifor",
                                 normalize_trend=False, n_top=10, outliers_fraction=0.1,
                                 log_transform=False):
    """Find anomalous regions in a time series using standard unsupervised detectors.

    The series is optionally log-transformed and/or differenced, then
    chopped up into windows ('shingles'), each flattened into one feature
    row.  The detector selected by ``ad_type`` is fit on all rows and the
    ``n_top`` highest-scoring windows are drawn in red over the (possibly
    transformed) series in a PDF under ``temp/timeseries/``.  When
    ``normalize_trend`` is set, the same windows are also drawn over the
    original ``data`` in a second plot.

    Args:
        dataset: dataset name; only used in the output PDF file name.
        data: the raw series (indexed and sliced like an array).
        window_size: shingle length, forwarded to ``get_shingles``.
        skip_size: forwarded to ``get_shingles``.
        ad_type: one of ``"ocsvm"``, ``"ifor"``, ``"lof"``, ``"autoenc"``.
        normalize_trend: difference the series before shingling.
        n_top: number of top-scoring windows to highlight.
        outliers_fraction: used as ``nu`` for the one-class SVM and as
            ``contamination`` for isolation forest and LOF.
        log_transform: log-transform the series before any de-trending.

    Raises:
        ValueError: if ``ad_type`` is not supported, or if the series
            yields no shingles.
    """
    supported = ("ocsvm", "ifor", "lof", "autoenc")
    # Fail fast: an unknown type used to surface only after shingling, as an
    # opaque TypeError when negating ``scores = None``.
    if ad_type not in supported:
        raise ValueError("Unsupported ad_type %r; expected one of: %s"
                         % (ad_type, ", ".join(supported)))

    x = w = None
    n = 0

    ts_data = data
    if log_transform:
        # log-transform now since the values are positive (in context of
        # many real-world datasets like airline); otherwise, values become
        # negative after de-trending
        ts_data = log_transform_series(ts_data, eps=1.0)

    if normalize_trend:
        # remove trend from series
        ts_data = difference_series(ts_data)

    ts = TSeries(ts_data, y=None)
    for x_, _, w in ts.get_shingles(window_size, skip_size=skip_size, batch_size=-1):
        # flatten every shingle into a single feature row
        x = np.reshape(x_, newshape=(x_.shape[0], -1))
        n = x.shape[0]
        logger.debug("Total instances: %d", n)

    if x is None:
        raise ValueError("time series produced no shingles for window_size=%r" % (window_size,))

    # All detector outputs are negated so that larger values rank first.
    if ad_type == "ocsvm":
        ad = svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)
        ad.fit(x)
        scores = -ad.decision_function(x).reshape((n, ))
    elif ad_type == "ifor":
        ad = IsolationForest(max_samples=min(256, x.shape[0]),
                             contamination=outliers_fraction, random_state=None)
        ad.fit(x)
        scores = -ad.decision_function(x)
    elif ad_type == "lof":
        ad = LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)
        ad.fit(x)
        scores = -ad._decision_function(x)
    else:  # "autoenc"
        n_hiddens = max(1, window_size // 2)
        ad = AutoencoderAnomalyDetector(
            n_inputs=x.shape[1],
            n_neurons=[300, n_hiddens, 300],
            normalize_scale=True,
            activations=[tf.nn.tanh, tf.nn.tanh, tf.nn.tanh, None])
        ad.fit(x)
        scores = -ad.decision_function(x)

    top_anoms = np.argsort(-scores)[0:n_top]
    logger.debug("top scores (%s):\n%s\n%s", ad_type, top_anoms, scores[top_anoms])

    pdfpath = "temp/timeseries/timeseries_shingles_%s_w%d%s_%s.pdf" % \
              (dataset, window_size, "" if not log_transform else "_log", ad_type)
    dp = DataPlotter(pdfpath=pdfpath, rows=2, cols=1)
    try:
        # plot the timeseries anomalies with the detrended series
        pl = dp.get_next_plot()
        pl.set_xlim([0, ts.samples.shape[0]])
        pl.plot(np.arange(0, ts.samples.shape[0]), ts.samples, 'b-', linewidth=0.5)
        for i in top_anoms:
            start = w[i]
            stop = start + window_size
            # skip windows that would run past the end of the series
            if stop <= len(ts.samples):
                pl.plot(np.arange(start, stop), ts.samples[start:stop], 'r-')

        if normalize_trend:
            # plot the original series with anomalous windows
            pl = dp.get_next_plot()
            pl.set_xlim([0, data.shape[0]])
            pl.plot(np.arange(0, data.shape[0]), data, 'b-', linewidth=0.5)
            for i in top_anoms:
                start = w[i]
                stop = start + window_size
                if stop <= len(data):
                    pl.plot(np.arange(start, stop), data[start:stop], 'r-')
    finally:
        # always release the plotter, even if drawing fails
        dp.close()