예제 #1
0
def _stack_with_labels(inliers, outliers):
    """Stack inlier rows over outlier rows and build the matching label vector.

    Labels follow the scikit-learn outlier-detection convention: +1 for
    inliers and -1 for outliers, so they are directly comparable with the
    output of ``predict``.
    """
    stacked = np.vstack((inliers, outliers))
    # Size the labels from the rows actually selected so both always agree.
    stacked_labels = np.hstack((
        np.ones(inliers.shape[0], dtype=int),
        -np.ones(outliers.shape[0], dtype=int),
    ))
    return stacked, stacked_labels


def anomaly_detection(features, labels, contamination=0.1):
    """Try to identify POIs as anomalies of a multivariate Gaussian model.

    An ``EllipticEnvelope`` is fitted on a subset of the non-POI samples only;
    the remaining non-POIs and all POIs are divided between a test set and a
    cross-validation set. The predictions, the reference labels and the
    accuracy on the cross-validation set are printed.

    Parameters
    ----------
    features : array
        Sample matrix; assumed to be a NumPy array with one row per sample,
        since it is indexed with boolean masks -- TODO confirm with caller.
    labels : array
        Per-sample class, 0 for non-POI and 1 for POI.
    contamination : float, optional
        Expected proportion of outliers in the training data, forwarded to
        ``EllipticEnvelope``. scikit-learn documents the valid range as
        (0, 0.5]; the previous hard-coded value of 0.85 lies outside it.

    Returns
    -------
    None
        Results are only printed.

    Notes
    -----
    ``produce_spliting_array(n, fraction)`` is defined elsewhere; it is
    presumably a 0/1 mask of length ``n`` with about ``fraction`` ones --
    verify against its definition.
    """
    from sklearn.covariance import EllipticEnvelope

    non_pois = features[labels == 0]
    pois = features[labels == 1]
    print("non poi size %s %s %s"
          % (non_pois.shape, pois.shape, features.shape))

    # Split the non-POIs: 75% for training, the rest for test / validation.
    split1 = produce_spliting_array(non_pois.shape[0], .75)
    X_train = non_pois[split1 == 1]
    X_intermediate = non_pois[split1 == 0]
    print("size intermediate %s" % (X_intermediate.shape,))

    # Halve the held-out non-POIs between the test and validation sets.
    split2 = produce_spliting_array(X_intermediate.shape[0], .5)
    test_non_pois = X_intermediate[split2 == 1]
    cv_non_pois = X_intermediate[split2 == 0]

    # POIs are never used for fitting; halve them between test and validation.
    split3 = produce_spliting_array(pois.shape[0], .5)
    test_pois = pois[split3 == 1]
    cv_pois = pois[split3 == 0]

    # Non-POIs are the "normal" class (+1) and POIs the anomalies (-1), which
    # is the encoding that predict() and score() use.
    X_test, label_test = _stack_with_labels(test_non_pois, test_pois)
    X_cv, label_cv = _stack_with_labels(cv_non_pois, cv_pois)

    print("size X_train %s" % (X_train.shape,))
    print("size test data %s %s" % (X_test.shape, label_test.shape))
    print("size cv data %s %s" % (X_cv.shape, label_cv.shape))
    print("size splits %s %s %s" % (len(split1), len(split2), len(split3)))

    detector = EllipticEnvelope(contamination=contamination)
    detector.fit(X_train)
    pred_cv = detector.predict(X_cv)
    print(pred_cv)
    print(label_cv)
    print(detector.score(X_cv, label_cv))
예제 #2
0
def test_outlier_detection():
    """Check EllipticEnvelope raw decision values and score on random data.

    NOTE(review): the ``raw_mahalanobis`` keyword and the manual centring
    before ``mahalanobis`` look like an older scikit-learn API -- confirm the
    targeted version.
    """
    n_samples, n_features = 100, 10
    X = np.random.RandomState(0).randn(n_samples, n_features)

    clf = EllipticEnvelope(contamination=0.1)
    clf.fit(X)
    y_pred = clf.predict(X)

    # Raw decision values must match the distances of the centred samples.
    raw_decision = clf.decision_function(X, raw_mahalanobis=True)
    centred = X - clf.location_
    assert_array_almost_equal(raw_decision, clf.mahalanobis(centred))

    # Scoring against all-ones labels yields the share of samples that were
    # not predicted as -1.
    n_flagged = y_pred[y_pred == -1].size
    expected_score = (n_samples - n_flagged) / 100.
    assert_almost_equal(clf.score(X, np.ones(n_samples)), expected_score)
def test_outlier_detection():
    """Verify decision values and score of a fitted EllipticEnvelope.

    Fits the estimator on seeded Gaussian noise, then checks that the raw
    decision function equals the Mahalanobis distance of the centred data and
    that the score against all-ones labels is the fraction of samples not
    predicted as -1.
    """
    rng = np.random.RandomState(0)
    data = rng.randn(100, 10)

    envelope = EllipticEnvelope(contamination=0.1)
    envelope.fit(data)
    predicted = envelope.predict(data)

    assert_array_almost_equal(
        envelope.decision_function(data, raw_mahalanobis=True),
        envelope.mahalanobis(data - envelope.location_),
    )

    all_inliers = np.ones(100)
    outlier_count = np.count_nonzero(predicted == -1)
    assert_almost_equal(
        envelope.score(data, all_inliers),
        (100 - outlier_count) / 100.0,
    )
def test_outlier_detection():
    """Exercise EllipticEnvelope before and after fitting on random data.

    Before ``fit`` both ``predict`` and ``decision_function`` must raise
    ``NotFittedError``. After ``fit`` the raw decision values, the Mahalanobis
    distances and ``dist_`` must agree, and the number of -1 predictions must
    equal the number of negative transformed decision values.

    NOTE(review): the ``raw_values`` keyword looks like an older scikit-learn
    API -- confirm the targeted version.
    """
    n_samples = 100
    X = np.random.RandomState(0).randn(n_samples, 10)
    clf = EllipticEnvelope(contamination=0.1)

    # An unfitted estimator must refuse to predict.
    for unfitted_call in (clf.predict, clf.decision_function):
        assert_raises(NotFittedError, unfitted_call, X)

    clf.fit(X)
    y_pred = clf.predict(X)
    raw = clf.decision_function(X, raw_values=True)
    shifted = clf.decision_function(X, raw_values=False)

    assert_array_almost_equal(raw, clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)

    n_flagged = y_pred[y_pred == -1].size
    assert_almost_equal(
        clf.score(X, np.ones(n_samples)), (n_samples - n_flagged) / 100.0
    )
    assert np.count_nonzero(y_pred == -1) == np.count_nonzero(shifted < 0)
def test_outlier_detection():
    """Check unfitted errors and fitted consistency of EllipticEnvelope.

    Uses 100 seeded Gaussian samples with 10 features and a contamination of
    0.1. The score against all-ones labels is compared with the fraction of
    samples not predicted as -1.
    """
    data = np.random.RandomState(0).randn(100, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    # Both entry points must fail until the estimator has been fitted.
    assert_raises(NotFittedError, envelope.predict, data)
    assert_raises(NotFittedError, envelope.decision_function, data)

    envelope.fit(data)
    predicted = envelope.predict(data)
    raw_decision = envelope.decision_function(data, raw_values=True)
    thresholded_decision = envelope.decision_function(data, raw_values=False)

    distances = envelope.mahalanobis(data)
    assert_array_almost_equal(raw_decision, distances)
    assert_array_almost_equal(envelope.mahalanobis(data), envelope.dist_)

    is_outlier = predicted == -1
    expected_score = (100 - predicted[is_outlier].size) / 100.
    assert_almost_equal(envelope.score(data, np.ones(100)), expected_score)

    # Samples predicted -1 are as many as negative decision values.
    assert is_outlier.sum() == (thresholded_decision < 0).sum()
def test_elliptic_envelope():
    """Check EllipticEnvelope scores, decisions and predictions are coherent.

    The estimator must raise ``NotFittedError`` before ``fit``. Once fitted,
    ``score_samples`` must be the negated Mahalanobis distance, ``dist_`` must
    equal the Mahalanobis distance of the training data, and every -1
    prediction must correspond to a negative decision value (by count).
    """
    n_samples, n_features = 100, 10
    X = np.random.RandomState(0).randn(n_samples, n_features)
    clf = EllipticEnvelope(contamination=0.1)

    for unfitted_call in (clf.predict, clf.decision_function):
        assert_raises(NotFittedError, unfitted_call, X)

    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    negated_distances = -clf.mahalanobis(X)
    assert_array_almost_equal(scores, negated_distances)
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)

    n_flagged = y_pred[y_pred == -1].size
    assert_almost_equal(
        clf.score(X, np.ones(n_samples)), (n_samples - n_flagged) / 100.
    )
    assert np.count_nonzero(y_pred == -1) == np.count_nonzero(decisions < 0)
def test_elliptic_envelope():
    """Validate an EllipticEnvelope fitted on seeded Gaussian noise.

    Covers the unfitted error path, the relation between ``score_samples``
    and ``mahalanobis``, the stored ``dist_`` attribute, the score against
    all-ones labels and the agreement between predictions and the sign of the
    decision function.
    """
    rng = np.random.RandomState(0)
    data = rng.randn(100, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    # Nothing may be predicted before fitting.
    assert_raises(NotFittedError, envelope.predict, data)
    assert_raises(NotFittedError, envelope.decision_function, data)

    envelope.fit(data)
    predicted = envelope.predict(data)
    sample_scores = envelope.score_samples(data)
    decision_values = envelope.decision_function(data)

    assert_array_almost_equal(sample_scores, -envelope.mahalanobis(data))
    assert_array_almost_equal(envelope.mahalanobis(data), envelope.dist_)

    flagged = predicted == -1
    inlier_fraction = (100 - predicted[flagged].size) / 100.
    assert_almost_equal(envelope.score(data, np.ones(100)), inlier_fraction)

    negative_decisions = decision_values < 0
    assert flagged.sum() == negative_decisions.sum()