예제 #1
0
def test_score_samples():
    """Check how ``score_samples`` relates to ``decision_function``.

    For each fitted envelope, ``score_samples`` must equal
    ``decision_function`` plus the fitted ``offset_``. The raw scores of an
    envelope fitted with ``contamination=0.2`` must also equal those of an
    envelope fitted with the default settings.
    """
    train = [[1, 1], [1, 2], [2, 1]]
    query = [[2., 2.]]
    with_contamination = EllipticEnvelope(contamination=0.2).fit(train)
    with_defaults = EllipticEnvelope().fit(train)

    for envelope in (with_contamination, with_defaults):
        raw_scores = envelope.score_samples(query)
        shifted_decision = envelope.decision_function(query) + envelope.offset_
        assert_array_equal(raw_scores, shifted_decision)

    assert_array_equal(with_contamination.score_samples(query),
                       with_defaults.score_samples(query))
예제 #2
0
class Baseline(ModelBase):
    """Wrapper that selects one of several detector models by short name.

    Recognised ``model_name`` values are ``'svm'`` (OneClassSVM), ``'if'``
    (IsolationForest), ``'lof'`` (LocalOutlierFactor), ``'gm'``
    (GaussianMixture) and ``'ee'`` (EllipticEnvelope). For any other name the
    constructor leaves ``self.model`` unset.
    """

    def __init__(self, model_name, packet_length=1500, seq_length=1, epochs=1):
        super().__init__(packet_length, seq_length, epochs)
        self.model_name = model_name

        # Deferred constructors: only the requested estimator is ever built.
        builders = (
            ('svm', lambda: OneClassSVM(kernel='rbf', nu=0.05)),
            ('if', lambda: IsolationForest(contamination=0.05,
                                           max_features=15,
                                           random_state=0)),
            ('lof', lambda: LocalOutlierFactor(contamination=0.05,
                                               novelty=True)),
            ('gm', lambda: GaussianMixture(random_state=0)),
            ('ee', lambda: EllipticEnvelope(contamination=0.05,
                                            random_state=0)),
        )
        for key, build in builders:
            if model_name == key:
                self.model = build()
                break

    def fit(self, X):
        """Fit the wrapped model on ``X``."""
        self.model.fit(X)

    def predict(self, X):
        """Return ``(scores, labels)`` computed by the wrapped model for ``X``."""
        estimator = self.model
        labels = estimator.predict(X)
        scores = estimator.score_samples(X)
        return scores, labels

    def save(self, name):
        """Dump the wrapped model to ``<name>_<model_name>.pkl``."""
        suffix = '_{}.pkl'.format(self.model_name)
        path = name + suffix
        joblib.dump(self.model, path)

    def load(self, name):
        """Replace the wrapped model with the one stored for ``name``."""
        suffix = '_{}.pkl'.format(self.model_name)
        path = name + suffix
        self.model = joblib.load(path)

    def exist(self, name):
        """Return whether the pickle file for ``name`` is present on disk."""
        suffix = '_{}.pkl'.format(self.model_name)
        path = name + suffix
        return os.path.exists(path)
    def schedule(self, event_input_name, event_input_value, data_from_pickle,
                 X_predict, X_train, y_train, store_precision, assume_centered,
                 support_fraction, contamination, random_state):
        """Handle an ``INIT`` or ``RUN`` event for the EllipticEnvelope block.

        ``INIT`` echoes ``event_input_value`` in the first output slot and
        reports the current state. ``RUN`` either trains a new classifier
        (when ``data_from_pickle`` is ``None``) or uses the supplied
        classifier to predict and score ``X_predict``.

        Args:
            event_input_name: ``'INIT'`` or ``'RUN'``. Any other value makes
                the method return ``None``.
            event_input_value: Value echoed back in the output slot that
                corresponds to the handled event.
            data_from_pickle: Classifier to use for prediction (presumably an
                already-fitted, unpickled estimator), or ``None`` to train.
            X_predict: Sample to classify; converted to ``float64`` and
                reshaped to a single row.
            X_train: Training samples, converted to ``float64``.
            y_train: Training targets, converted to ``float64`` and passed to
                ``fit`` alongside ``X_train``.
            store_precision: Optional override stored on ``self`` and
                forwarded to ``EllipticEnvelope``; ``None`` keeps the stored
                value.
            assume_centered: Same handling as ``store_precision``.
            support_fraction: Same handling as ``store_precision``.
            contamination: Same handling as ``store_precision``.
            random_state: Same handling as ``store_precision``.

        Returns:
            ``[init_event, run_event, classifier, prediction, score]`` for
            ``INIT`` and ``RUN``; ``None`` for any other event name.
        """
        if event_input_name == 'INIT':
            return [
                event_input_value, None, self.classifier, self.prediction,
                self.score
            ]

        if event_input_name != 'RUN':
            return None

        if data_from_pickle is None:
            # Training path: remember every explicitly supplied override.
            overrides = {
                'store_precision': store_precision,
                'assume_centered': assume_centered,
                'support_fraction': support_fraction,
                'contamination': contamination,
                'random_state': random_state,
            }
            for attr, value in overrides.items():
                if value is not None:
                    setattr(self, attr, value)

            # Forward the stored settings to the estimator. They used to be
            # recorded on ``self`` but never passed on, so the classifier was
            # always built with library defaults. Settings that are absent or
            # ``None`` are left to the estimator's own defaults.
            # NOTE(review): assumes any defaults pre-set on ``self`` are valid
            # EllipticEnvelope arguments - confirm against the constructor.
            params = {}
            for attr in overrides:
                stored = getattr(self, attr, None)
                if stored is not None:
                    params[attr] = stored

            classif = EllipticEnvelope(**params)
            classif.fit(
                np.array(X_train).astype(np.float64),
                np.array(y_train).astype(np.float64))
            self.classifier = classif
        else:
            # Prediction path: build the single-row sample once and reuse it.
            sample = np.array(X_predict).astype(np.float64).reshape(1, -1)
            self.classifier = data_from_pickle
            self.prediction = data_from_pickle.predict(sample)
            self.score = data_from_pickle.score_samples(sample)

        return [
            None, event_input_value, self.classifier, self.prediction,
            self.score
        ]
def test_elliptic_envelope():
    """Exercise EllipticEnvelope on 100 random 10-dimensional samples.

    Checks that an unfitted estimator raises ``NotFittedError``, that the
    scores are the negated Mahalanobis distances, that those distances match
    ``dist_``, that ``score`` against all-ones labels equals the fraction of
    samples not predicted as ``-1``, and that the number of ``-1``
    predictions equals the number of negative decision values.
    """
    n_samples = 100
    rng = np.random.RandomState(0)
    data = rng.randn(n_samples, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    # Both prediction entry points must refuse to run before fitting.
    for unfitted_call in (envelope.predict, envelope.decision_function):
        assert_raises(NotFittedError, unfitted_call, data)

    envelope.fit(data)
    labels = envelope.predict(data)
    raw_scores = envelope.score_samples(data)
    decision_values = envelope.decision_function(data)
    distances = envelope.mahalanobis(data)

    assert_array_almost_equal(raw_scores, -distances)
    assert_array_almost_equal(distances, envelope.dist_)

    expected_accuracy = (n_samples - labels[labels == -1].size) / 100.
    assert_almost_equal(envelope.score(data, np.ones(n_samples)),
                        expected_accuracy)
    assert sum(labels == -1) == sum(decision_values < 0)
def test_elliptic_envelope():
    """Validate EllipticEnvelope predictions, scores and decision values.

    Uses 100 standard-normal samples with 10 features and a contamination of
    0.1. The estimator must raise ``NotFittedError`` before fitting; after
    fitting, scores must be the negated Mahalanobis distances, the distances
    must match ``dist_``, ``score`` against all-ones labels must equal the
    share of samples not labelled ``-1``, and the count of ``-1`` labels must
    equal the count of negative decision values.
    """
    sample_count = 100
    random_source = np.random.RandomState(0)
    samples = random_source.randn(sample_count, 10)
    detector = EllipticEnvelope(contamination=0.1)

    # Calling either method before ``fit`` has to fail.
    assert_raises(NotFittedError, detector.predict, samples)
    assert_raises(NotFittedError, detector.decision_function, samples)

    detector.fit(samples)
    predicted = detector.predict(samples)
    sample_scores = detector.score_samples(samples)
    decision = detector.decision_function(samples)

    mahalanobis_distances = detector.mahalanobis(samples)
    assert_array_almost_equal(sample_scores, -mahalanobis_distances)
    assert_array_almost_equal(mahalanobis_distances, detector.dist_)

    outlier_total = predicted[predicted == -1].size
    assert_almost_equal(detector.score(samples, np.ones(sample_count)),
                        (sample_count - outlier_total) / 100.)
    assert sum(predicted == -1) == sum(decision < 0)
예제 #6
0
def main():
    """Run the demonstration when the script is called directly.

    Generates a rotated and shifted elliptical point cloud, fits a confidence
    ellipse to it with ``fit_ellipse``, classifies a few test points with
    ``use_ellipse``, and then repeats the classification with scikit-learn's
    ``EllipticEnvelope`` for comparison. Progress and results are printed.
    """
    # define a normal distribution that roughly spans -1 to 1
    mu = 0.0
    sigma = 0.35

    # create some ellipse-like data, using that distribution
    n_points = 2500
    a = 3.0  # semi-major axis
    b = 1.0  # semi-minor axis
    x = a * np.random.normal(mu, sigma, n_points)
    y = b * np.random.normal(mu, sigma, n_points)

    # load up the x and y points into an n-by-2 array
    points = np.vstack((x, y)).T

    # apply a constant-angle rotation to the data
    theta_deg = -13
    theta = np.pi * theta_deg / 180.0

    rotation_matrix = compute_rotation_matrix(theta)
    points = np.dot(points, rotation_matrix)

    # apply a shift to the data point in the x and y directions
    x_shift = 5
    y_shift = -5
    points += [x_shift, y_shift]

    # pull out the x and y values again as lists, for demonstration purposes
    x = list(points[:, 0])
    y = list(points[:, 1])

    # fit a confidence ellipse to the data
    print('\n  - fitting a confidence ellipse to the data...')
    confidence = 0.95
    ellipse_info = fit_ellipse(x, y, confidence_interval=confidence)

    # [user input] create a new point to test
    print('\n  - running a new point through the ellipse...')
    new_points = [(-5, -5), (5, 5), (5, -5)]

    # quantitatively see if the point falls within the ellipse or not
    results = use_ellipse(new_points,
                          ellipse_info,
                          visualize_process=True,
                          verbose=True,
                          plots_directory='outlierness_plots')

    # print a summary note about the results
    inlier_counts = results['inlier'].value_counts()

    # value_counts() omits categories that never occur, so a set of test
    # points that are all inliers (or all outliers) would raise KeyError on
    # direct indexing; fall back to a count of zero instead.
    n_inliers = inlier_counts.get(True, 0)
    n_outliers = inlier_counts.get(False, 0)

    print('\n\t  - of the', len(results), 'points passed in, there are',
          n_inliers, 'inliers and', n_outliers, 'outliers')

    # fit a scikit-learn gaussian elliptic envelope to the data
    print('\n  - fitting a scikit-learn gaussian ellipse to the data...')
    detector = EllipticEnvelope(contamination=1 - confidence)
    detector.fit(points)

    # run the new point through the detector
    print('\n  - running a new point through the scikit-learn ellipse...')
    new_points = np.array(new_points).reshape(-1, 2)
    inlier_sk = detector.predict(new_points)
    mahalanobis_score = detector.score_samples(new_points)
    print('\n\t  - inlier:', inlier_sk)
    print('\t  - mahalanobis score:', mahalanobis_score)

    # adjacent literals are joined at compile time; the printed text is
    # unchanged from the explicit '+' concatenation it replaces
    print('\n\tN.B. Although the Mahalanobis distances (a.k.a. "scores") '
          '\n\tcomputed by scikit-learn do provide a statistically '
          '\n\tmeaningful metric of how far away from the center of the '
          '\n\tellipse a point lies, it doesn\'t provide any information '
          '\n\tabout whether the point is an inlier or an outlier! So, it '
          '\n\tmakes more sense to just use my implementation and the '
          '\n\t"outlierness" metric, which spans [-1, inf): postive values '
          '\n\timply outliers, negative values imply inliers, and a value '
          '\n\tof -1 corresponds to the center of the ellipse.\n')