示例#1
0
    def setUp(self):
        """Prepare the data split and a fitted LSCP model for each test.

        ``cardio.mat`` is looked up in the ``data`` directory next to this
        file.  When loading it raises ``TypeError`` or ``IOError``, a notice
        is printed and generated data is used instead.  The samples are then
        split with ``test_size=0.4`` and ``random_state=42``, an LSCP model
        built from two LOF detectors is fitted on the training part, and
        ``roc_floor`` is set to 0.6.
        """
        mat_file = 'cardio.mat'
        here = path.abspath(path.dirname(__file__))

        try:
            mat = loadmat(path.join(here, 'data', mat_file))
        except (TypeError, IOError):
            # Both failure modes are reported the same way and fall back to
            # generated data.
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)
        else:
            features = mat['X']
            targets = mat['y'].ravel()
            X, y = check_X_y(features, targets)

        split = train_test_split(X, y, test_size=0.4, random_state=42)
        self.X_train, self.X_test, self.y_train, self.y_test = split

        self.clf = LSCP(base_estimators=[LOF(), LOF()])
        self.clf.fit(self.X_train)
        self.roc_floor = 0.6
示例#2
0
# Demo script: fit an LSCP ensemble on generated data and print how well its
# outlier scores perform on the training and test sets.
contamination = 0.1  # proportion of outliers (a fraction, not a percentage)
n_train = 200  # number of training points
n_test = 100  # number of testing points

# Generate sample data
# NOTE(review): the unpacking order below assumes generate_data returns
# (X_train, y_train, X_test, y_test) -- confirm against the version in use.
X_train, y_train, X_test, y_test = \
    generate_data(n_train=n_train,
                  n_test=n_test,
                  n_features=2,
                  contamination=contamination,
                  random_state=42)

# heterogeneous base detectors combined by LSCP
detectors = [KNN(), LOF(), OCSVM()]

clf_name = 'LSCP'
clf = LSCP(base_estimators=detectors)
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
# The report is labelled with clf_name so the output names the model that
# actually produced the scores (previously hard-coded as 'Average').
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
示例#3
0
class TestLSCP(unittest.TestCase):
    """Unit tests for an LSCP detector built from two LOF base estimators."""

    def setUp(self):
        """Load (or generate) data, split it, and fit the model under test.

        ``cardio.mat`` is read from the ``data`` directory next to this
        file; if loading raises ``TypeError`` or ``IOError`` a notice is
        printed and generated data is used instead.
        """
        mat_file = 'cardio.mat'
        here = path.abspath(path.dirname(__file__))

        try:
            mat = loadmat(path.join(here, 'data', mat_file))
        except (TypeError, IOError):
            # Both failure modes are reported the same way and fall back to
            # generated data.
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)
        else:
            features = mat['X']
            targets = mat['y'].ravel()
            X, y = check_X_y(features, targets)

        split = train_test_split(X, y, test_size=0.4, random_state=42)
        self.X_train, self.X_test, self.y_train, self.y_test = split

        self.clf = LSCP(base_estimators=[LOF(), LOF()])
        self.clf.fit(self.X_train)
        # lowest ROC AUC accepted by test_prediction_scores
        self.roc_floor = 0.6

    def _assert_proba_in_unit_interval(self, proba):
        """Assert that every value in ``proba`` lies within [0, 1]."""
        assert (proba.min() >= 0)
        assert (proba.max() <= 1)

    def test_parameters(self):
        """Fitting must populate every expected attribute with a value."""
        fitted_attributes = ('decision_scores_', 'labels_', 'threshold_',
                             '_mu', '_sigma')
        for attribute in fitted_attributes:
            assert (hasattr(self.clf, attribute)
                    and getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        """There is one training score per training sample."""
        n_train = self.X_train.shape[0]
        assert_equal(len(self.clf.decision_scores_), n_train)

    def test_prediction_scores(self):
        """Test scores have the right length and reach the ROC floor."""
        pred_scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # performance must not drop below the configured floor
        roc = roc_auc_score(self.y_test, pred_scores)
        assert (roc >= self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the true test labels."""
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probability estimates stay within [0, 1]."""
        self._assert_proba_in_unit_interval(
            self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        """'linear' probability estimates stay within [0, 1]."""
        self._assert_proba_in_unit_interval(
            self.clf.predict_proba(self.X_test, proba_method='linear'))

    def test_prediction_proba_unify(self):
        """'unify' probability estimates stay within [0, 1]."""
        self._assert_proba_in_unit_interval(
            self.clf.predict_proba(self.X_test, proba_method='unify'))

    def test_prediction_proba_parameter(self):
        """An unknown ``proba_method`` raises ``ValueError``."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, proba_method='something')

    def test_fit_predict(self):
        """``fit_predict`` returns one label per training sample."""
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def tearDown(self):
        """Nothing to clean up."""
        pass
示例#4
0
    def fit(self, X):
        """
        Fit individual detectors and combine their results.

        The input is validated and Z-score scaled, every base detector is
        fitted on the scaled data, and its scores are converted to labels
        with BCM.  Then, for each strategy named in ``self.scores_comb``, a
        combined score column and a label column are filled in.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The RD profile of all segments generated after preprocessing.

        Returns
        -------
        self : object
             Fitted estimator.
             NOTE(review): no ``return`` statement is visible in this
             excerpt -- confirm that the method really ends with
             ``return self``.

        Notes
        -----
        Reads ``self.scores_comb``, ``self.is_require_X`` and
        ``self.bandwidth``.  Sets ``scores_base_`` and ``labels_base_`` (one
        column per base detector) and ``scores_`` and ``labels_`` (one
        column per entry of ``self.scores_comb``).
        """
        X = check_array(X)

        # normalization of all segments with Z-score
        scale_X = scale(X)

        # all base detectors with default parameters
        detectors = [LOF(), SO_GAAL(), IForest(), HBOS(), CBLOF()]

        # record results for individual detectors
        # (rows: segments, columns: base detectors)
        self.scores_base_ = np.zeros((len(scale_X), len(detectors)))
        self.labels_base_ = np.zeros((len(scale_X), len(detectors)))

        # record results for all merging strategies
        # (rows: segments, columns: entries of self.scores_comb)
        self.scores_ = np.zeros((len(scale_X), len(self.scores_comb)))
        self.labels_ = np.zeros((len(scale_X), len(self.scores_comb)))

        for i in range(len(detectors)):
            # fit detector i and score the same data it was fitted on
            clf = detectors[i].fit(scale_X)
            self.scores_base_[:, i] = clf.decision_function(scale_X)

            # obtain a series of binary labels using the BCM
            _npat = BCM(X=scale_X,
                        is_require_X=self.is_require_X,
                        bandwidth=self.bandwidth)
            _npat.fit(self.scores_base_[:, i].reshape(-1, 1))
            self.labels_base_[:, i] = _npat.labels_

        # normalization of all outlier score vectors with Z-score
        _scale_score = scale(self.scores_base_)

        for i in range(len(self.scores_comb)):
            if self.scores_comb[i] == "voting":  # majority_vote
                # voting produces no combined score, so the column is NaN
                self.scores_[:, i] = np.array([np.nan] * len(scale_X))
                # each row of labels_base_ holds one label per detector;
                # the most common one becomes the combined label
                # NOTE(review): assumes the labels are binary so that five
                # detectors can never tie -- confirm against BCM
                self.labels_[:, i] = np.array(
                    [statistics.mode(j) for j in self.labels_base_])

            elif self.scores_comb[i] == "maximum":
                # the maximum of five outlier scores for each segment
                self.scores_[:, i] = np.max(_scale_score, axis=1)

                # obtain binary labels with BCM
                _npat = BCM(X=scale_X,
                            is_require_X=self.is_require_X,
                            bandwidth=self.bandwidth)
                _npat.fit(self.scores_[:, i].reshape(-1, 1))
                self.labels_[:, i] = _npat.labels_

            elif self.scores_comb[i] == "lscp":
                # hand the detectors fitted above to LSCP
                # NOTE(review): pre_fitted=True presumably stops LSCP from
                # refitting them -- confirm against the LSCP implementation
                clf = LSCP(detectors, pre_fitted=True)
                clf.fit(scale_X)
                self.scores_[:, i] = clf.decision_function(scale_X)

                # obtain binary labels with the BCM
                _npat = BCM(X=scale_X,
                            is_require_X=self.is_require_X,
                            bandwidth=self.bandwidth)
                _npat.fit(self.scores_[:, i].reshape(-1, 1))
                self.labels_[:, i] = _npat.labels_

            elif self.scores_comb[i] == "averaging":
                # the mean of the scaled outlier scores for each segment
                self.scores_[:, i] = np.mean(_scale_score, axis=1)

                # obtain binary labels with the BCM
                _npat = BCM(X=scale_X,
                            is_require_X=self.is_require_X,
                            bandwidth=self.bandwidth)
                _npat.fit(self.scores_[:, i].reshape(-1, 1))
                self.labels_[:, i] = _npat.labels_