def test_score_samples():
    """Check that ``score_samples`` equals ``decision_function + offset_``.

    Two envelopes are fitted on the same three points, one with
    ``contamination=0.2`` and one with the default. For each, the raw score
    of a query point must match the decision value shifted back by the
    fitted offset, and the raw scores of both envelopes must coincide.
    """
    training_points = [[1, 1], [1, 2], [2, 1]]
    query = [[2., 2.]]

    contaminated = EllipticEnvelope(contamination=0.2).fit(training_points)
    default = EllipticEnvelope().fit(training_points)

    # score_samples is the un-shifted quantity behind decision_function.
    for estimator in (contaminated, default):
        assert_array_equal(
            estimator.score_samples(query),
            estimator.decision_function(query) + estimator.offset_,
        )

    # The raw score is compared across both contamination settings.
    assert_array_equal(
        contaminated.score_samples(query),
        default.score_samples(query),
    )
class Baseline(ModelBase):
    """Wrapper around one outlier/novelty estimator chosen by a short name.

    Recognised ``model_name`` values are ``'svm'``, ``'if'``, ``'lof'``,
    ``'gm'`` and ``'ee'``. For any other value this constructor does not
    assign ``self.model``.
    """

    def __init__(self, model_name, packet_length=1500, seq_length=1, epochs=1):
        """Initialise the base class, remember the name and build the estimator."""
        super().__init__(packet_length, seq_length, epochs)
        self.model_name = model_name

        # Constructors are wrapped in lambdas so only the selected estimator
        # is instantiated; matching uses ``==`` against each key in order.
        constructors = (
            ('svm', lambda: OneClassSVM(kernel='rbf', nu=0.05)),
            ('if', lambda: IsolationForest(contamination=0.05,
                                           max_features=15,
                                           random_state=0)),
            ('lof', lambda: LocalOutlierFactor(contamination=0.05,
                                               novelty=True)),
            ('gm', lambda: GaussianMixture(random_state=0)),
            ('ee', lambda: EllipticEnvelope(contamination=0.05,
                                            random_state=0)),
        )
        for key, construct in constructors:
            if model_name == key:
                self.model = construct()
                break

    def _pickle_path(self, name):
        """Return the pickle file path derived from ``name`` and the model name."""
        suffix = '_{}.pkl'.format(self.model_name)
        return name + suffix

    def fit(self, X):
        """Fit the wrapped estimator on ``X``."""
        self.model.fit(X)

    def predict(self, X):
        """Return ``(scores, labels)`` for ``X`` from the wrapped estimator."""
        labels = self.model.predict(X)
        scores = self.model.score_samples(X)
        return scores, labels

    def save(self, name):
        """Dump the wrapped estimator with joblib to the derived path."""
        joblib.dump(self.model, self._pickle_path(name))

    def load(self, name):
        """Replace the wrapped estimator with the one stored at the derived path."""
        self.model = joblib.load(self._pickle_path(name))

    def exist(self, name):
        """Return whether the derived pickle path exists on disk."""
        return os.path.exists(self._pickle_path(name))
def schedule(self, event_input_name, event_input_value, data_from_pickle,
             X_predict, X_train, y_train, store_precision, assume_centered,
             support_fraction, contamination, random_state):
    """Handle an ``INIT`` or ``RUN`` event for an EllipticEnvelope block.

    ``INIT`` returns ``[event_input_value, None, classifier, prediction,
    score]`` from the current instance state.

    ``RUN`` without ``data_from_pickle`` trains a new ``EllipticEnvelope`` on
    ``X_train`` (``y_train`` is passed to ``fit`` as well) and stores it in
    ``self.classifier``. ``RUN`` with ``data_from_pickle`` uses that object
    as the classifier and stores its ``predict`` / ``score_samples`` output
    for ``X_predict`` (reshaped to a single row) in ``self.prediction`` and
    ``self.score``. Both ``RUN`` paths return ``[None, event_input_value,
    classifier, prediction, score]``.

    Any other event name returns ``None``.

    Each hyper-parameter argument that is not ``None`` overwrites the
    attribute of the same name on ``self``; the resulting non-``None``
    attribute values are forwarded to the ``EllipticEnvelope`` constructor.
    """
    if event_input_name == 'INIT':
        return [
            event_input_value, None, self.classifier, self.prediction,
            self.score
        ]

    if event_input_name != 'RUN':
        return None

    if data_from_pickle is None:
        requested = {
            'store_precision': store_precision,
            'assume_centered': assume_centered,
            'support_fraction': support_fraction,
            'contamination': contamination,
            'random_state': random_state,
        }
        estimator_kwargs = {}
        for param, value in requested.items():
            if value is not None:
                setattr(self, param, value)
            # Forward whatever is configured on the instance; previously the
            # stored values were never handed to the estimator, so training
            # silently used the library defaults.
            configured = getattr(self, param, None)
            if configured is not None:
                estimator_kwargs[param] = configured

        classif = EllipticEnvelope(**estimator_kwargs)
        classif.fit(
            np.array(X_train).astype(np.float64),
            np.array(y_train).astype(np.float64))
        self.classifier = classif
    else:
        classif = data_from_pickle
        self.classifier = classif
        # A single sample is expected, so it is reshaped into one row.
        self.prediction = classif.predict(
            np.array(X_predict).astype(np.float64).reshape(1, -1))
        self.score = classif.score_samples(
            np.array(X_predict).astype(np.float64).reshape(1, -1))

    return [
        None, event_input_value, self.classifier, self.prediction,
        self.score
    ]
def test_elliptic_envelope():
    """Exercise EllipticEnvelope before and after fitting on random data.

    Verifies that querying an unfitted estimator raises ``NotFittedError``,
    that ``score_samples`` is the negated Mahalanobis distance, that the
    distances match ``dist_``, that ``score`` against all-ones labels equals
    the predicted inlier fraction, and that predicted outliers coincide in
    number with negative decision values.
    """
    rng = np.random.RandomState(0)
    samples = rng.randn(100, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    # Neither query method may be used before fit().
    for unfitted_call in (envelope.predict, envelope.decision_function):
        assert_raises(NotFittedError, unfitted_call, samples)

    envelope.fit(samples)
    labels = envelope.predict(samples)
    raw_scores = envelope.score_samples(samples)
    decision_values = envelope.decision_function(samples)

    assert_array_almost_equal(raw_scores, -envelope.mahalanobis(samples))
    assert_array_almost_equal(envelope.mahalanobis(samples), envelope.dist_)

    # Accuracy against "everything is an inlier" is the inlier fraction.
    assert_almost_equal(
        envelope.score(samples, np.ones(100)),
        (100 - labels[labels == -1].size) / 100.,
    )
    assert (sum(labels == -1) == sum(decision_values < 0))
def test_elliptic_envelope():
    """Check EllipticEnvelope's not-fitted errors and post-fit consistency.

    After fitting on 100 random 10-dimensional samples, the raw scores must
    be the negated Mahalanobis distances, those distances must equal
    ``dist_``, ``score`` with all-ones targets must equal the share of
    samples not predicted as ``-1``, and the count of ``-1`` predictions
    must equal the count of negative decision values.
    """
    random_source = np.random.RandomState(0)
    observations = random_source.randn(100, 10)
    detector = EllipticEnvelope(contamination=0.1)

    # An unfitted detector must reject both kinds of query.
    assert_raises(NotFittedError, detector.predict, observations)
    assert_raises(NotFittedError, detector.decision_function, observations)

    detector.fit(observations)
    predicted = detector.predict(observations)
    sample_scores = detector.score_samples(observations)
    decision = detector.decision_function(observations)

    negated_distance = -detector.mahalanobis(observations)
    assert_array_almost_equal(sample_scores, negated_distance)

    distance = detector.mahalanobis(observations)
    assert_array_almost_equal(distance, detector.dist_)

    accuracy = detector.score(observations, np.ones(100))
    expected_accuracy = (100 - predicted[predicted == -1].size) / 100.
    assert_almost_equal(accuracy, expected_accuracy)

    flagged_by_predict = sum(predicted == -1)
    flagged_by_decision = sum(decision < 0)
    assert(flagged_by_predict == flagged_by_decision)
def main():
    '''
    to be run if script is called directly

    Generates a rotated, shifted cloud of normally distributed 2-D points,
    fits a confidence ellipse to it with fit_ellipse, classifies a few test
    points with use_ellipse, and then repeats the classification with
    scikit-learn's EllipticEnvelope for comparison. Results are printed.
    '''

    # define a normal distribution that roughly spans -1 to 1
    mu = 0.0
    sigma = 0.35

    # create some ellipse-like data, using that distribution
    n_points = 2500
    a = 3.0  # semi-major axis
    b = 1.0  # semi-minor axis
    x = a * np.random.normal(mu, sigma, n_points)
    y = b * np.random.normal(mu, sigma, n_points)

    # load up the x and y points into an n-by-2 array
    points = np.vstack((x, y)).T

    # apply a constant-angle rotation to the data
    theta_deg = -13
    theta = np.pi * theta_deg / 180.0
    rotation_matrix = compute_rotation_matrix(theta)
    points = np.dot(points, rotation_matrix)

    # apply a shift to the data point in the x and y directions
    x_shift = 5
    y_shift = -5
    points += [x_shift, y_shift]

    # pull out the x and y values again as lists, for demonstration purposes
    x = list(points[:, 0])
    y = list(points[:, 1])

    # fit a confidence ellipse to the data
    print('\n - fitting a confidence ellipse to the data...')
    confidence = 0.95
    ellipse_info = fit_ellipse(x, y, confidence_interval=confidence)

    # [user input] create a new point to test
    print('\n - running a new point through the ellipse...')
    new_points = [(-5, -5), (5, 5), (5, -5)]

    # quantitatively see if the point falls within the ellipse or not
    results = use_ellipse(new_points, ellipse_info,
                          visualize_process=True, verbose=True,
                          plots_directory='outlierness_plots')

    # print a summary note about the results; .get() with a default of 0
    # avoids a KeyError when the test points are all inliers or all outliers
    inlier_counts = results['inlier'].value_counts()
    n_inliers = inlier_counts.get(True, 0)
    n_outliers = inlier_counts.get(False, 0)
    print('\n\t - of the', len(results), 'points passed in, there are',
          n_inliers, 'inliers and', n_outliers, 'outliers')

    # fit a scikit-learn gaussian elliptic envelope to the data
    print('\n - fitting a scikit-learn gaussian ellipse to the data...')
    detector = EllipticEnvelope(contamination=1 - confidence)
    detector.fit(points)

    # run the new point through the detector
    print('\n - running a new point through the scikit-learn ellipse...')
    new_points = np.array(new_points).reshape(-1, 2)
    inlier_sk = detector.predict(new_points)
    mahalanobis_score = detector.score_samples(new_points)
    print('\n\t - inlier:', inlier_sk)
    print('\t - mahalanobis score:', mahalanobis_score)
    print(
        '\n\tN.B. Although the Mahalanobis distances (a.k.a. "scores") '
        '\n\tcomputed by scikit-learn do provide a statistically '
        '\n\tmeaningful metric of how far away from the center of the '
        '\n\tellipse a point lies, it doesn\'t provide any information '
        '\n\tabout whether the point is an inlier or an outlier! So, it '
        '\n\tmakes more sense to just use my implementation and the '
        '\n\t"outlierness" metric, which spans [-1, inf): postive values '
        '\n\timply outliers, negative values imply inliers, and a value '
        '\n\tof -1 corresponds to the center of the ellipse.\n'
    )