示例#1
0
    def test_data_generate_cluster5(self):
        """Check that invalid argument values are rejected with ValueError.

        Each call passes the string ``'e'`` for exactly one argument
        (``n_clusters``, ``n_features``, ``contamination`` or ``dist``)
        while all other arguments stay valid, and is expected to raise.
        """
        # Valid arguments shared by every case; one is overridden per call.
        valid_kwargs = dict(n_train=self.n_train,
                            n_test=self.n_test,
                            n_features=3,
                            contamination=self.contamination,
                            random_state=self.random_state)

        # The return value is deliberately not unpacked: the call has to
        # raise before returning, so assignment targets would never be bound.
        with assert_raises(ValueError):
            generate_data_clusters(**dict(valid_kwargs, n_clusters='e'))

        with assert_raises(ValueError):
            generate_data_clusters(**dict(valid_kwargs, n_features='e'))

        with assert_raises(ValueError):
            generate_data_clusters(**dict(valid_kwargs, contamination='e'))

        with assert_raises(ValueError):
            generate_data_clusters(**dict(valid_kwargs, dist='e'))
示例#2
0
    def test_data_generate_cluster3(self):
        """Check that generation is reproducible for a fixed random_state.

        Two calls with identical arguments must return element-wise equal
        feature matrices and label vectors.
        """
        # Same arguments for both calls so only the RNG seeding is under test.
        kwargs = dict(n_train=self.n_train,
                      n_test=self.n_test,
                      n_features=3,
                      contamination=self.contamination,
                      random_state=self.random_state)

        # Unpack as (X_train, X_test, y_train, y_test): features first,
        # labels second, so the local names describe what they hold.
        X_train, X_test, y_train, y_test = generate_data_clusters(**kwargs)
        X_train2, X_test2, y_train2, y_test2 = \
            generate_data_clusters(**kwargs)

        assert_allclose(X_train, X_train2)
        assert_allclose(X_test, X_test2)
        assert_allclose(y_train, y_train2)
        assert_allclose(y_test, y_test2)
示例#3
0
    def test_data_generate_cluster2(self):
        """Check the shapes of the generated train and test matrices.

        With ``n_features=4`` the feature matrices are expected to have
        shape ``(n_train, 4)`` and ``(n_test, 4)``.
        """
        n_features = 4
        # Only the feature matrices are inspected; the labels are discarded.
        X_train, X_test, _, _ = generate_data_clusters(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=n_features,
            contamination=self.contamination,
            random_state=self.random_state)

        expected_train_shape = (self.n_train, n_features)
        assert_allclose(X_train.shape, expected_train_shape)

        expected_test_shape = (self.n_test, n_features)
        assert_allclose(X_test.shape, expected_test_shape)
示例#4
0
文件: test_gmm.py 项目: yzhao062/pyod
    def setUp(self):
        """Generate clustered sample data and fit the GMM detector."""
        # Dataset sizes and outlier fraction used for data generation.
        self.n_train, self.n_test = 200, 100
        self.contamination = 0.1
        # Used both as the number of generated clusters and as the number
        # of mixture components of the detector.
        self.n_components = 4
        # Stored on the instance; not used within setUp itself.
        self.roc_floor = 0.8

        data_splits = generate_data_clusters(
            n_train=self.n_train,
            n_test=self.n_test,
            n_clusters=self.n_components,
            contamination=self.contamination,
            random_state=42,
        )
        self.X_train, self.X_test, self.y_train, self.y_test = data_splits

        detector = GMM(n_components=self.n_components,
                       contamination=self.contamination)
        self.clf = detector
        detector.fit(self.X_train)
示例#5
0
    def test_data_generate_cluster(self):
        """Check sizes, feature count and outlier ratio of generated data.

        Asserts that labels and features have matching lengths, that each
        split is at most one sample short of the requested size, that the
        matrices have two columns, and that the overall share of positive
        labels is within 0.01 of ``self.contamination``.
        """
        n_features = 2
        X_train, X_test, y_train, y_test = generate_data_clusters(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=n_features,
            contamination=self.contamination,
            random_state=self.random_state)

        # (features, labels, requested size) for each split.
        splits = ((X_train, y_train, self.n_train),
                  (X_test, y_test, self.n_test))

        # Every sample must have exactly one label.
        for features, labels, _ in splits:
            assert_equal(labels.shape[0], features.shape[0])

        # A split may be at most one sample short of the requested size.
        for features, _, n_requested in splits:
            assert n_requested - features.shape[0] <= 1
            assert_equal(features.shape[1], n_features)

        # The ratio is taken against the requested sizes, not the realised
        # ones, hence the absolute tolerance.
        n_outliers = np.sum(y_train) + np.sum(y_test)
        n_requested_total = self.n_train + self.n_test
        observed_ratio = n_outliers / n_requested_total
        assert_allclose(self.contamination, observed_ratio, atol=0.01)
示例#6
0
# Temporary solution for relative imports in case pyod is not installed.
# If pyod is installed, the following lines are not needed.
#
# The parent of the directory containing this file is appended to sys.path.
# ``__file__`` must be the real module attribute: the string literal
# "__file__" has an empty dirname, which silently resolves against the
# current working directory instead of the script location.
try:
    _here = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # ``__file__`` is undefined when the code is pasted into an interactive
    # session; fall back to the working directory in that case.
    _here = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(_here, "..")))

# Example script: generate clustered sample data, fit a GMM detector on the
# training split, and collect labels/scores for the train and test splits.
if __name__ == "__main__":
    contamination = 0.1  # fraction of outliers in the generated data
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data: 2 features, 4 clusters, fixed seed
    X_train, X_test, y_train, y_test = generate_data_clusters(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        n_clusters=4,
        contamination=contamination,
        random_state=42,
    )

    # train GMM detector; n_components matches n_clusters used above
    # NOTE(review): `contamination` is not forwarded to GMM here, so the
    # detector relies on its own default -- confirm that is intended.
    clf_name = "GMM"
    clf = GMM(n_components=4)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
from pyod.models.lof import LOF
from pyod.utils.data import generate_data_clusters
from pyod.utils.example import data_visualize
from pyod.utils.example import visualize
from pyod.utils.data import evaluate_print

if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers

    # Generate sample data in clusters
    X, y = generate_data_clusters(n_train=450,
                                  n_test=50,
                                  n_clusters=3,
                                  n_features=2,
                                  contamination=contamination,
                                  size='different',
                                  density='different',
                                  dist=0.2,
                                  random_state=42,
                                  return_in_clusters=True)

    # visualize the results
    data_visualize(X, y, show_figure=True, save_figure=False)

    # test on the generated datasets

    # Generate sample data in clusters
    X_train, X_test, y_train, y_test = generate_data_clusters(
        n_train=450,
        n_test=50,
        n_clusters=3,