from pyod.models.cof import COF


def cof(X):
    # Expected outlier fraction and neighborhood size for COF
    contamination_factor = 0.1
    k = 20
    clf = COF(contamination=contamination_factor, n_neighbors=k)
    clf.fit(X)
    label = clf.labels_  # binary labels (0: inlier, 1: outlier)
    score = clf.decision_scores_  # raw outlier scores on X
    threshold = clf.threshold_  # score cutoff implied by the contamination
    writeLabel(label)
    return
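A minimal driver for the helper above, assuming pyod is installed; writeLabel is the snippet's own persistence hook, stubbed here so the sketch runs end to end:

from pyod.utils.data import generate_data

def writeLabel(labels):
    # Stand-in for the original persistence helper (assumption)
    print("labels written:", labels[:10])

# Synthetic 2-D data with a known 10% outlier fraction
X, y = generate_data(n_train=200, train_only=True,
                     contamination=0.1, random_state=42)
cof(X)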
Example #2
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = COF(contamination=self.contamination)
        self.clf.fit(self.X_train)
def getOutlierCOF(dataset):
    '''
    @brief Runs the COF algorithm on the dataset and obtains labels
    indicating whether each instance is an inlier (0) or an outlier (1)
    @param dataset Dataset on which to run the algorithm
    @return A list of labels: 0 means inlier, 1 means outlier
    '''
    # Initializing the model
    cof = COF()
    # Fits the data and obtains labels
    cof.fit(dataset)
    # Return labels
    return cof.labels_
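A toy invocation, assuming `from pyod.models.cof import COF` is in scope for the function above; the cluster and the far-away points are synthetic:

import numpy as np

rng = np.random.RandomState(0)
inliers = rng.randn(95, 2)            # dense Gaussian cluster
outliers = rng.uniform(6, 8, (5, 2))  # points far from the cluster
data = np.vstack([inliers, outliers])

labels = getOutlierCOF(data)
print("points flagged as outliers:", int(labels.sum()))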
Example #4
def aCOF(dataset, contamination, n_neighbors, name):
    algo = COF(contamination=contamination,
               n_neighbors=n_neighbors).fit(dataset)
    outlier_labels = algo.predict(dataset)
    # np.where returns a tuple; take the first element for the row positions
    outlier_index = where(outlier_labels == 1)[0]
    outlier_values = dataset.iloc[outlier_index]
    number_of_outliers = len(outlier_values)
    plt.title(name, loc='center', fontsize=20)
    plt.scatter(dataset["P1"], dataset["P2"], color="b", s=65)
    plt.scatter(outlier_values["P1"], outlier_values["P2"], color="r")
    plt.figtext(
        0.7,
        0.91,
        'contamination = {}\nn_neighbors = {} \nnumber of outliers = {}'.format(
            contamination, n_neighbors, number_of_outliers),
        fontsize=9)
    plt.show()
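A hedged driver for aCOF, assuming its module imports numpy's where, pandas, matplotlib.pyplot as plt, and pyod's COF; the P1/P2 columns match what the plotting code expects:

import numpy as np
import pandas as pd

rng = np.random.RandomState(7)
points = np.vstack([rng.randn(100, 2),           # inlier blob
                    rng.uniform(5, 7, (8, 2))])  # planted outliers
df = pd.DataFrame(points, columns=["P1", "P2"])

aCOF(df, contamination=0.08, n_neighbors=10, name="COF on toy data")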
Example #5
def main():

    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    name = "30_Models"

    Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root, scaler, models, start, counts,
          other_models=sklearn_models,
          CPUS=CPUS_Models,
          save_name=name)
         for scaler in scalers)
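The closing Parallel/delayed call fans runByScaler out over the three scalers. A minimal standalone sketch of that joblib idiom (square is a stand-in for runByScaler):

from joblib import Parallel, delayed

def square(x):
    return x * x

# One task per input, executed across 2 worker processes
results = Parallel(n_jobs=2)(delayed(square)(i) for i in range(5))
print(results)  # [0, 1, 4, 9, 16]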
Example #6
    def test_check_parameters(self):
        with assert_raises(ValueError):
            COF(contamination=0.1, n_neighbors=-1)
        with assert_raises(ValueError):
            COF(contamination=10., n_neighbors=5)
        with assert_raises(TypeError):
            COF(contamination=0.1, n_neighbors='not int')
        with assert_raises(TypeError):
            COF(contamination='not float', n_neighbors=5)
        cof_ = COF(contamination=0.1, n_neighbors=10000)
        cof_.fit(self.X_train)
        assert self.X_train.shape[0] > cof_.n_neighbors_
Example #7
    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = COF(contamination=hyperparams['contamination'],
                        n_neighbors=hyperparams['n_neighbors'],
                        )
def choose_model(model, nnet):
    """ among implemented in PyOD """
    clfs = {
        'AE':
        AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE':
        VAE(encoder_neurons=nnet[:5],
            decoder_neurons=nnet[4:],
            contamination=0.1,
            epochs=13),
        'ABOD':
        ABOD(),
        'FeatureBagging':
        FeatureBagging(),
        'HBOS':
        HBOS(),
        'IForest':
        IForest(),
        'KNN':
        KNN(),
        'LOF':
        LOF(),
        'OCSVM':
        OCSVM(),
        'PCA':
        PCA(),
        'SOS':
        SOS(),
        'COF':
        COF(),
        'CBLOF':
        CBLOF(),
        'SOD':
        SOD(),
        'LOCI':
        LOCI(),
        'MCD':
        MCD()
    }
    return clfs[model]
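A usage sketch; note that every detector in the dict is constructed eagerly, so nnet must be a valid layer layout even when a non-network model is requested (the 9-element layout here is illustrative):

from pyod.utils.data import generate_data

# Encoder takes nnet[:5], decoder takes nnet[4:]
nnet = [10, 8, 4, 2, 1, 2, 4, 8, 10]
clf = choose_model('COF', nnet)

X, y = generate_data(n_train=200, train_only=True, random_state=42)
clf.fit(X)
print(clf.labels_[:10])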
Example #9
    def runMethod(self):
        '''
        @brief This function is the actual implementation of HICS
        '''
        if self.verbose:
            print("Calculating the subspaces\n")
        # First we obtain the high contrast subspaces
        subspaces = self.hicsFramework()

        if self.verbose:
            print("Now calculating the scoring\n")
        # We initialize the scores for each instance as 0
        scores = np.zeros(len(self.dataset))
        # For each subspace
        for sub in subspaces:
            # We place the corresponding scorer according to parameter
            scorer = None
            if self.outlier_rank == "lof":
                scorer = LOF()
            elif self.outlier_rank == "cof":
                scorer = COF()
            elif self.outlier_rank == "cblof":
                scorer = CBLOF()
            elif self.outlier_rank == "loci":
                scorer = LOCI()
            elif self.outlier_rank == "hbos":
                scorer = HBOS()
            elif self.outlier_rank == "sod":
                scorer = SOD()
            # Fits the scorer with the dataset projection
            scorer.fit(self.dataset[:, sub])
            # Adds the scores obtained to the global ones
            scores = scores + scorer.decision_scores_
        # Compute the average
        self.outlier_score = scores / len(subspaces)
        # Marks the calculations as done
        self.calculations_done = True
Example #10
    'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
    'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal', 'VAE',
    'AutoEncoder'
]

models = {
    'BRM': BRM(),
    'GM': GaussianMixture(),
    'IF': IsolationForest(),
    'OCSVM': OneClassSVM(),
    'EE': EllipticEnvelope(),
    'AvgKNN': KNN(method='mean'),
    'LargestKNN': KNN(method='largest'),
    'MedKNN': KNN(method='median'),
    'PCA': PCA(),
    'COF': COF(),
    'LODA': LODA(),
    'LOF': LOF(),
    'HBOS': HBOS(),
    'MCD': MCD(),
    'AvgBagging': FeatureBagging(combination='average'),
    'MaxBagging': FeatureBagging(combination='max'),
    'IForest': IForest(),
    'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
    'COPOD': COPOD(),
    'SOD': SOD(),
    'LSCPwithLODA': LSCP([LODA(), LODA()]),
    'AveLMDD': LMDD(dis_measure='aad'),
    'VarLMDD': LMDD(dis_measure='var'),
Example #11
def get_detectors():
    # randomness_flags = []
    BASE_ESTIMATORS = [
        LODA(n_bins=5, n_random_cuts=10),
        LODA(n_bins=5, n_random_cuts=20),
        LODA(n_bins=5, n_random_cuts=30),
        LODA(n_bins=5, n_random_cuts=40),
        LODA(n_bins=5, n_random_cuts=50),
        LODA(n_bins=5, n_random_cuts=75),
        LODA(n_bins=5, n_random_cuts=100),
        LODA(n_bins=5, n_random_cuts=150),
        LODA(n_bins=5, n_random_cuts=200),
        LODA(n_bins=10, n_random_cuts=10),
        LODA(n_bins=10, n_random_cuts=20),
        LODA(n_bins=10, n_random_cuts=30),
        LODA(n_bins=10, n_random_cuts=40),
        LODA(n_bins=10, n_random_cuts=50),
        LODA(n_bins=10, n_random_cuts=75),
        LODA(n_bins=10, n_random_cuts=100),
        LODA(n_bins=10, n_random_cuts=150),
        LODA(n_bins=10, n_random_cuts=200),
        LODA(n_bins=15, n_random_cuts=10),
        LODA(n_bins=15, n_random_cuts=20),
        LODA(n_bins=15, n_random_cuts=30),
        LODA(n_bins=15, n_random_cuts=40),
        LODA(n_bins=15, n_random_cuts=50),
        LODA(n_bins=15, n_random_cuts=75),
        LODA(n_bins=15, n_random_cuts=100),
        LODA(n_bins=15, n_random_cuts=150),
        LODA(n_bins=15, n_random_cuts=200),
        LODA(n_bins=20, n_random_cuts=10),
        LODA(n_bins=20, n_random_cuts=20),
        LODA(n_bins=20, n_random_cuts=30),
        LODA(n_bins=20, n_random_cuts=40),
        LODA(n_bins=20, n_random_cuts=50),
        LODA(n_bins=20, n_random_cuts=75),
        LODA(n_bins=20, n_random_cuts=100),
        LODA(n_bins=20, n_random_cuts=150),
        LODA(n_bins=20, n_random_cuts=200),
        LODA(n_bins=25, n_random_cuts=10),
        LODA(n_bins=25, n_random_cuts=20),
        LODA(n_bins=25, n_random_cuts=30),
        LODA(n_bins=25, n_random_cuts=40),
        LODA(n_bins=25, n_random_cuts=50),
        LODA(n_bins=25, n_random_cuts=75),
        LODA(n_bins=25, n_random_cuts=100),
        LODA(n_bins=25, n_random_cuts=150),
        LODA(n_bins=25, n_random_cuts=200),
        LODA(n_bins=30, n_random_cuts=10),
        LODA(n_bins=30, n_random_cuts=20),
        LODA(n_bins=30, n_random_cuts=30),
        LODA(n_bins=30, n_random_cuts=40),
        LODA(n_bins=30, n_random_cuts=50),
        LODA(n_bins=30, n_random_cuts=75),
        LODA(n_bins=30, n_random_cuts=100),
        LODA(n_bins=30, n_random_cuts=150),
        LODA(n_bins=30, n_random_cuts=200),
        ABOD(n_neighbors=3),
        ABOD(n_neighbors=5),
        ABOD(n_neighbors=10),
        ABOD(n_neighbors=15),
        ABOD(n_neighbors=20),
        ABOD(n_neighbors=25),
        ABOD(n_neighbors=50),
        ABOD(n_neighbors=60),
        ABOD(n_neighbors=75),
        ABOD(n_neighbors=80),
        ABOD(n_neighbors=90),
        ABOD(n_neighbors=100),
        IForest(n_estimators=10, max_features=0.1),
        IForest(n_estimators=10, max_features=0.2),
        IForest(n_estimators=10, max_features=0.3),
        IForest(n_estimators=10, max_features=0.4),
        IForest(n_estimators=10, max_features=0.5),
        IForest(n_estimators=10, max_features=0.6),
        IForest(n_estimators=10, max_features=0.7),
        IForest(n_estimators=10, max_features=0.8),
        IForest(n_estimators=10, max_features=0.9),
        IForest(n_estimators=20, max_features=0.1),
        IForest(n_estimators=20, max_features=0.2),
        IForest(n_estimators=20, max_features=0.3),
        IForest(n_estimators=20, max_features=0.4),
        IForest(n_estimators=20, max_features=0.5),
        IForest(n_estimators=20, max_features=0.6),
        IForest(n_estimators=20, max_features=0.7),
        IForest(n_estimators=20, max_features=0.8),
        IForest(n_estimators=20, max_features=0.9),
        IForest(n_estimators=30, max_features=0.1),
        IForest(n_estimators=30, max_features=0.2),
        IForest(n_estimators=30, max_features=0.3),
        IForest(n_estimators=30, max_features=0.4),
        IForest(n_estimators=30, max_features=0.5),
        IForest(n_estimators=30, max_features=0.6),
        IForest(n_estimators=30, max_features=0.7),
        IForest(n_estimators=30, max_features=0.8),
        IForest(n_estimators=30, max_features=0.9),
        IForest(n_estimators=40, max_features=0.1),
        IForest(n_estimators=40, max_features=0.2),
        IForest(n_estimators=40, max_features=0.3),
        IForest(n_estimators=40, max_features=0.4),
        IForest(n_estimators=40, max_features=0.5),
        IForest(n_estimators=40, max_features=0.6),
        IForest(n_estimators=40, max_features=0.7),
        IForest(n_estimators=40, max_features=0.8),
        IForest(n_estimators=40, max_features=0.9),
        IForest(n_estimators=50, max_features=0.1),
        IForest(n_estimators=50, max_features=0.2),
        IForest(n_estimators=50, max_features=0.3),
        IForest(n_estimators=50, max_features=0.4),
        IForest(n_estimators=50, max_features=0.5),
        IForest(n_estimators=50, max_features=0.6),
        IForest(n_estimators=50, max_features=0.7),
        IForest(n_estimators=50, max_features=0.8),
        IForest(n_estimators=50, max_features=0.9),
        IForest(n_estimators=75, max_features=0.1),
        IForest(n_estimators=75, max_features=0.2),
        IForest(n_estimators=75, max_features=0.3),
        IForest(n_estimators=75, max_features=0.4),
        IForest(n_estimators=75, max_features=0.5),
        IForest(n_estimators=75, max_features=0.6),
        IForest(n_estimators=75, max_features=0.7),
        IForest(n_estimators=75, max_features=0.8),
        IForest(n_estimators=75, max_features=0.9),
        IForest(n_estimators=100, max_features=0.1),
        IForest(n_estimators=100, max_features=0.2),
        IForest(n_estimators=100, max_features=0.3),
        IForest(n_estimators=100, max_features=0.4),
        IForest(n_estimators=100, max_features=0.5),
        IForest(n_estimators=100, max_features=0.6),
        IForest(n_estimators=100, max_features=0.7),
        IForest(n_estimators=100, max_features=0.8),
        IForest(n_estimators=100, max_features=0.9),
        IForest(n_estimators=150, max_features=0.1),
        IForest(n_estimators=150, max_features=0.2),
        IForest(n_estimators=150, max_features=0.3),
        IForest(n_estimators=150, max_features=0.4),
        IForest(n_estimators=150, max_features=0.5),
        IForest(n_estimators=150, max_features=0.6),
        IForest(n_estimators=150, max_features=0.7),
        IForest(n_estimators=150, max_features=0.8),
        IForest(n_estimators=150, max_features=0.9),
        IForest(n_estimators=200, max_features=0.1),
        IForest(n_estimators=200, max_features=0.2),
        IForest(n_estimators=200, max_features=0.3),
        IForest(n_estimators=200, max_features=0.4),
        IForest(n_estimators=200, max_features=0.5),
        IForest(n_estimators=200, max_features=0.6),
        IForest(n_estimators=200, max_features=0.7),
        IForest(n_estimators=200, max_features=0.8),
        IForest(n_estimators=200, max_features=0.9),
        KNN(n_neighbors=1, method='largest'),
        KNN(n_neighbors=5, method='largest'),
        KNN(n_neighbors=10, method='largest'),
        KNN(n_neighbors=15, method='largest'),
        KNN(n_neighbors=20, method='largest'),
        KNN(n_neighbors=25, method='largest'),
        KNN(n_neighbors=50, method='largest'),
        KNN(n_neighbors=60, method='largest'),
        KNN(n_neighbors=70, method='largest'),
        KNN(n_neighbors=80, method='largest'),
        KNN(n_neighbors=90, method='largest'),
        KNN(n_neighbors=100, method='largest'),
        KNN(n_neighbors=1, method='mean'),
        KNN(n_neighbors=5, method='mean'),
        KNN(n_neighbors=10, method='mean'),
        KNN(n_neighbors=15, method='mean'),
        KNN(n_neighbors=20, method='mean'),
        KNN(n_neighbors=25, method='mean'),
        KNN(n_neighbors=50, method='mean'),
        KNN(n_neighbors=60, method='mean'),
        KNN(n_neighbors=70, method='mean'),
        KNN(n_neighbors=80, method='mean'),
        KNN(n_neighbors=90, method='mean'),
        KNN(n_neighbors=100, method='mean'),
        KNN(n_neighbors=1, method='median'),
        KNN(n_neighbors=5, method='median'),
        KNN(n_neighbors=10, method='median'),
        KNN(n_neighbors=15, method='median'),
        KNN(n_neighbors=20, method='median'),
        KNN(n_neighbors=25, method='median'),
        KNN(n_neighbors=50, method='median'),
        KNN(n_neighbors=60, method='median'),
        KNN(n_neighbors=70, method='median'),
        KNN(n_neighbors=80, method='median'),
        KNN(n_neighbors=90, method='median'),
        KNN(n_neighbors=100, method='median'),
        LOF(n_neighbors=1, metric='manhattan'),
        LOF(n_neighbors=5, metric='manhattan'),
        LOF(n_neighbors=10, metric='manhattan'),
        LOF(n_neighbors=15, metric='manhattan'),
        LOF(n_neighbors=20, metric='manhattan'),
        LOF(n_neighbors=25, metric='manhattan'),
        LOF(n_neighbors=50, metric='manhattan'),
        LOF(n_neighbors=60, metric='manhattan'),
        LOF(n_neighbors=70, metric='manhattan'),
        LOF(n_neighbors=80, metric='manhattan'),
        LOF(n_neighbors=90, metric='manhattan'),
        LOF(n_neighbors=100, metric='manhattan'),
        LOF(n_neighbors=1, metric='euclidean'),
        LOF(n_neighbors=5, metric='euclidean'),
        LOF(n_neighbors=10, metric='euclidean'),
        LOF(n_neighbors=15, metric='euclidean'),
        LOF(n_neighbors=20, metric='euclidean'),
        LOF(n_neighbors=25, metric='euclidean'),
        LOF(n_neighbors=50, metric='euclidean'),
        LOF(n_neighbors=60, metric='euclidean'),
        LOF(n_neighbors=70, metric='euclidean'),
        LOF(n_neighbors=80, metric='euclidean'),
        LOF(n_neighbors=90, metric='euclidean'),
        LOF(n_neighbors=100, metric='euclidean'),
        LOF(n_neighbors=1, metric='minkowski'),
        LOF(n_neighbors=5, metric='minkowski'),
        LOF(n_neighbors=10, metric='minkowski'),
        LOF(n_neighbors=15, metric='minkowski'),
        LOF(n_neighbors=20, metric='minkowski'),
        LOF(n_neighbors=25, metric='minkowski'),
        LOF(n_neighbors=50, metric='minkowski'),
        LOF(n_neighbors=60, metric='minkowski'),
        LOF(n_neighbors=70, metric='minkowski'),
        LOF(n_neighbors=80, metric='minkowski'),
        LOF(n_neighbors=90, metric='minkowski'),
        LOF(n_neighbors=100, metric='minkowski'),
        HBOS(n_bins=5, alpha=0.1),
        HBOS(n_bins=5, alpha=0.2),
        HBOS(n_bins=5, alpha=0.3),
        HBOS(n_bins=5, alpha=0.4),
        HBOS(n_bins=5, alpha=0.5),
        HBOS(n_bins=10, alpha=0.1),
        HBOS(n_bins=10, alpha=0.2),
        HBOS(n_bins=10, alpha=0.3),
        HBOS(n_bins=10, alpha=0.4),
        HBOS(n_bins=10, alpha=0.5),
        HBOS(n_bins=20, alpha=0.1),
        HBOS(n_bins=20, alpha=0.2),
        HBOS(n_bins=20, alpha=0.3),
        HBOS(n_bins=20, alpha=0.4),
        HBOS(n_bins=20, alpha=0.5),
        HBOS(n_bins=30, alpha=0.1),
        HBOS(n_bins=30, alpha=0.2),
        HBOS(n_bins=30, alpha=0.3),
        HBOS(n_bins=30, alpha=0.4),
        HBOS(n_bins=30, alpha=0.5),
        HBOS(n_bins=40, alpha=0.1),
        HBOS(n_bins=40, alpha=0.2),
        HBOS(n_bins=40, alpha=0.3),
        HBOS(n_bins=40, alpha=0.4),
        HBOS(n_bins=40, alpha=0.5),
        HBOS(n_bins=50, alpha=0.1),
        HBOS(n_bins=50, alpha=0.2),
        HBOS(n_bins=50, alpha=0.3),
        HBOS(n_bins=50, alpha=0.4),
        HBOS(n_bins=50, alpha=0.5),
        HBOS(n_bins=75, alpha=0.1),
        HBOS(n_bins=75, alpha=0.2),
        HBOS(n_bins=75, alpha=0.3),
        HBOS(n_bins=75, alpha=0.4),
        HBOS(n_bins=75, alpha=0.5),
        HBOS(n_bins=100, alpha=0.1),
        HBOS(n_bins=100, alpha=0.2),
        HBOS(n_bins=100, alpha=0.3),
        HBOS(n_bins=100, alpha=0.4),
        HBOS(n_bins=100, alpha=0.5),
        OCSVM(nu=0.1, kernel="linear"),
        OCSVM(nu=0.2, kernel="linear"),
        OCSVM(nu=0.3, kernel="linear"),
        OCSVM(nu=0.4, kernel="linear"),
        OCSVM(nu=0.5, kernel="linear"),
        OCSVM(nu=0.6, kernel="linear"),
        OCSVM(nu=0.7, kernel="linear"),
        OCSVM(nu=0.8, kernel="linear"),
        OCSVM(nu=0.9, kernel="linear"),
        OCSVM(nu=0.1, kernel="poly"),
        OCSVM(nu=0.2, kernel="poly"),
        OCSVM(nu=0.3, kernel="poly"),
        OCSVM(nu=0.4, kernel="poly"),
        OCSVM(nu=0.5, kernel="poly"),
        OCSVM(nu=0.6, kernel="poly"),
        OCSVM(nu=0.7, kernel="poly"),
        OCSVM(nu=0.8, kernel="poly"),
        OCSVM(nu=0.9, kernel="poly"),
        OCSVM(nu=0.1, kernel="rbf"),
        OCSVM(nu=0.2, kernel="rbf"),
        OCSVM(nu=0.3, kernel="rbf"),
        OCSVM(nu=0.4, kernel="rbf"),
        OCSVM(nu=0.5, kernel="rbf"),
        OCSVM(nu=0.6, kernel="rbf"),
        OCSVM(nu=0.7, kernel="rbf"),
        OCSVM(nu=0.8, kernel="rbf"),
        OCSVM(nu=0.9, kernel="rbf"),
        OCSVM(nu=0.1, kernel="sigmoid"),
        OCSVM(nu=0.2, kernel="sigmoid"),
        OCSVM(nu=0.3, kernel="sigmoid"),
        OCSVM(nu=0.4, kernel="sigmoid"),
        OCSVM(nu=0.5, kernel="sigmoid"),
        OCSVM(nu=0.6, kernel="sigmoid"),
        OCSVM(nu=0.7, kernel="sigmoid"),
        OCSVM(nu=0.8, kernel="sigmoid"),
        OCSVM(nu=0.9, kernel="sigmoid"),
        COF(n_neighbors=3),
        COF(n_neighbors=5),
        COF(n_neighbors=10),
        COF(n_neighbors=15),
        COF(n_neighbors=20),
        COF(n_neighbors=25),
        COF(n_neighbors=50),
    ]

    # randomness_flags.extend([True] * 54)  # LODA
    # randomness_flags.extend([False] * 12)  # ABOD
    # randomness_flags.extend([True] * 81)  # IForest
    # randomness_flags.extend([False] * 36)  # KNN
    # randomness_flags.extend([False] * 36)  # LOF
    # randomness_flags.extend([False] * 40)  # HBOS
    # randomness_flags.extend([False] * 36)  # OCSVM
    # randomness_flags.extend([False] * 7)  # COF
    # return BASE_ESTIMATORS, randomness_flags
    return BASE_ESTIMATORS
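A quick sanity check on the grid; the arithmetic just mirrors the list above:

detectors = get_detectors()
# 302 = 54 LODA + 12 ABOD + 81 IForest + 36 KNN + 36 LOF + 40 HBOS + 36 OCSVM + 7 COF
print(len(detectors))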
Example #12
class TestCOF(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = COF(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, 'n_neighbors_')
                and self.clf.n_neighbors_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        print(pred_ranks)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_check_parameters(self):
        with assert_raises(ValueError):
            COF(contamination=0.1, n_neighbors=-1)
        with assert_raises(ValueError):
            COF(contamination=10., n_neighbors=5)
        with assert_raises(TypeError):
            COF(contamination=0.1, n_neighbors='not int')
        with assert_raises(TypeError):
            COF(contamination='not float', n_neighbors=5)
        cof_ = COF(contamination=0.1, n_neighbors=10000)
        cof_.fit(self.X_train)
        assert self.X_train.shape[0] > cof_.n_neighbors_

    def tearDown(self):
        pass
Example #13
def main():

    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod

    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'

    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying 'plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ],
                         scalers=[
                             'Without scaler', 'Min max scaler',
                             'Standard scaler', 'Without scaler',
                             'Min max scaler', 'Standard scaler'
                         ])

    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/minmax/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with standard scaling'
        ])
Example #14
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, X_test, y_train, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train COF detector
    clf_name = 'COF'
    clf = COF(n_neighbors=30)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
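For 2-D data like this, pyod's bundled plotting helper can render the train/test predictions side by side; an optional follow-up using the arrays produced above:

    from pyod.utils.example import visualize

    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False)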
def compare(inputdata, labels, n_clusters, dset_name):
    """
    Compute the AUC, Fgap, Frank score on all conventional outlier detectors for the given dataset
    Args:
        inputdata: input data
        labels: ground truth outlier labels
        n_clusters: number of clusters, for some cluster-based detectors
        dset_name: dataset

    Returns: AUC, Fgap, Frank

    """
    print(
        "Competing with conventional unsupervised outlier detection algorithms..."
    )
    random_state = np.random.RandomState(1)
    if inputdata.shape[1] < 64:
        AEneurons = [16, 8, 8, 16]
        VAEneurons = [16, 8, 4], [4, 8, 16]
    else:
        AEneurons = [64, 32, 32, 64]
        VAEneurons = [128, 64, 32], [32, 64, 128]

    classifiers = {
        'PCA':
        PCA(random_state=random_state),
        'AutoEncoder':
        AutoEncoder(batch_size=100,
                    hidden_neurons=AEneurons,
                    random_state=random_state),
        'VAE':
        VAE(batch_size=100,
            encoder_neurons=VAEneurons[0],
            decoder_neurons=VAEneurons[1],
            random_state=random_state),
        'COPOD':
        COPOD(),
        'Iforest':
        IForest(random_state=random_state),
        'LODA':
        LODA(),
        'OCSVM':
        OCSVM(),
        'ABOD':
        ABOD(n_neighbors=20),
        'Fb':
        FeatureBagging(random_state=random_state),
        'CBLOF':
        CBLOF(n_clusters=n_clusters,
              check_estimator=False,
              random_state=random_state),
        'LOF':
        LOF(),
        'COF':
        COF()
    }

    for clf_name, clf in classifiers.items():
        print(f"Using {clf_name} method")
        starttime = time.time()
        clf.fit(inputdata)
        time_taken = time.time() - starttime
        test_scores = clf.decision_scores_

        # -----fix broken (NaN/inf) scores----- #
        # np.isfinite is False for NaN and +/-inf alike
        for i in range(len(test_scores)):
            if not np.isfinite(test_scores[i]):
                test_scores[i] = 0

        np.save(f'{dset_name}/{clf_name}_raw.npy', test_scores)
        auc = roc_auc_score(labels, test_scores)
        print('AUC:', auc)
        fetch(normalize(test_scores), f'../datasets/{dset_name.upper()}_Y.npy',
              f'{dset_name}/attribute.npy')
        print('time_taken:', time_taken)
    # Specify the root directory
    datasets_path = "Anomaly_Datasets_csv"
    rootDir = os.path.abspath(datasets_path)
    # specify the random state
    rs = 10
    # Save how to run the models

    detector_list = [LOF(), LOF()]

    models = [
        # BRM github
        (brminer.BRM(), 'BRM'),
        # ocSVM sklearn
        (OneClassSVM(gamma='auto'), 'ocSVM'),
        # COF pyod
        (COF(contamination=0.1, n_neighbors=20), 'COF'),
        # ABOD pyod
        (ABOD(contamination=0.1, n_neighbors=5, method='fast'), 'ABOD'),
        # MO_GAAL pyod
        (MO_GAAL(k=10,
                 stop_epochs=20,
                 lr_d=0.01,
                 lr_g=0.0001,
                 decay=1e-06,
                 momentum=0.9,
                 contamination=0.1), 'MO_GAAL'),
        # SO_GAAL pyod
        (SO_GAAL(stop_epochs=20,
                 lr_d=0.01,
                 lr_g=0.0001,
                 decay=1e-06,
    #equipment_hist = sorted_dataset[['Equipment Name','date']].groupby('Equipment Name').count().plot.barh()

    #plt.plot(data_hist['Inspection Date'],data_hist['date'])
    #plt.show()
    #print(sorted_dataset.to_string())

    sliced_data = sorted_dataset[[
        'PD Average', 'PD Count', 'Temperature', 'Humidity', 'Loading'
    ]]

    print(sorted_dataset.loc[sorted_dataset['Confirm action'] ==
                             '2'].to_string())

    clfs = [
        ABOD(contamination=.01),
        COF(contamination=.01),
        CBLOF(contamination=.01),
        IForest(contamination=.01)
    ]

    anomalies = []

    for clf in clfs:
        clf.fit(sliced_data)
        y_train_pred = clf.labels_
        sorted_dataset['Anomaly_status'] = y_train_pred
        anomalies.extend(sorted_dataset.loc[sorted_dataset['Anomaly_status'] ==
                                            1].index.values.tolist())
        print("Completed:" + clf.__class__.__name__)

    anomaly_counter = Counter(anomalies)
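The Counter then makes consensus easy to read off; a small hedged follow-up that keeps only rows flagged by more than one of the four detectors:

    # Rows flagged by at least two of the four detectors above
    consensus = [idx for idx, hits in anomaly_counter.items() if hits >= 2]
    print(len(consensus), "rows flagged by multiple models")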
Example #18
    def initialise_pyod_classifiers(self, outlier_fraction):
        #Testing every query to every class and then predicting only if it belongs to the same class
        classifiers = {}
        #Proximity based
        classifiers['K Nearest Neighbors (KNN)'] = []
        classifiers['Average K Nearest Neighbors (AvgKNN)'] = []
        classifiers['Median K Nearest Neighbors (MedKNN)'] = []
        classifiers['Local Outlier Factor (LOF)'] = []
        classifiers['Connectivity-Based Outlier Factor (COF)'] = []
        #classifiers['Clustering-Based Local Outlier Factor (CBLOF)'] = []
        classifiers['LOCI'] = []
        #classifiers['Histogram-based Outlier Score (HBOS)'] = []
        classifiers['Subspace Outlier Detection (SOD)'] = []
        #Linear models
        classifiers['Principal Component Analysis (PCA)'] = []
        #classifiers['Minimum Covariance Determinant (MCD)'] = []           #To slow
        classifiers['One-Class Support Vector Machines (OCSVM)'] = []
        classifiers['Deviation-based Outlier Detection (LMDD)'] = []
        #Probabilistic
        classifiers['Angle-Based Outlier Detection (ABOD)'] = []
        classifiers['Stochastic Outlier Selection (SOS)'] = []
        #Outlier Ensembles
        classifiers['Isolation Forest (IForest)'] = []
        classifiers['Feature Bagging'] = []
        classifiers['Lightweight On-line Detector of Anomalies (LODA)'] = []

        # One set of detectors per class
        for i in range(self.k_way):
            classifiers['K Nearest Neighbors (KNN)'].append(
                KNN(method='largest',
                    n_neighbors=int(self.n_shot / 3) + 1,
                    contamination=outlier_fraction))
            classifiers['Average K Nearest Neighbors (AvgKNN)'].append(
                KNN(method='mean',
                    n_neighbors=int(self.n_shot / 3) + 1,
                    contamination=outlier_fraction))
            classifiers['Median K Nearest Neighbors (MedKNN)'].append(
                KNN(method='median',
                    n_neighbors=int(self.n_shot / 3) + 1,
                    contamination=outlier_fraction))
            classifiers['Local Outlier Factor (LOF)'].append(
                LOF(n_neighbors=int(self.n_shot / 3) + 1,
                    contamination=outlier_fraction))
            classifiers['Connectivity-Based Outlier Factor (COF)'].append(
                COF(n_neighbors=int(self.n_shot / 3) + 1,
                    contamination=outlier_fraction))
            classifiers['LOCI'].append(
                LOCI(contamination=outlier_fraction))
            classifiers['Subspace Outlier Detection (SOD)'].append(
                SOD(n_neighbors=int(self.n_shot / 3) + 2,
                    contamination=outlier_fraction,
                    ref_set=max(2, int((int(self.n_shot / 3) + 2) / 3))))
            classifiers['Principal Component Analysis (PCA)'].append(
                PCA(contamination=outlier_fraction))
            classifiers['One-Class Support Vector Machines (OCSVM)'].append(
                OCSVM(contamination=outlier_fraction))
            classifiers['Deviation-based Outlier Detection (LMDD)'].append(
                LMDD(contamination=outlier_fraction))
            classifiers['Angle-Based Outlier Detection (ABOD)'].append(
                ABOD(contamination=outlier_fraction))
            classifiers['Stochastic Outlier Selection (SOS)'].append(
                SOS(contamination=outlier_fraction))
            classifiers['Isolation Forest (IForest)'].append(
                IForest(contamination=outlier_fraction))
            classifiers['Feature Bagging'].append(
                FeatureBagging(contamination=outlier_fraction))
            classifiers['Lightweight On-line Detector of Anomalies (LODA)'].append(
                LODA(contamination=outlier_fraction))
        self.num_different_models = len(classifiers)
        return classifiers
    f.write("Model: " + modelname + "\n")
    f.write("Dataset " + str(datasetnumber) + ": " + datasetname + "\n")
    f.write("Time taken: " + str(time) + " seg.\n")
    f.write("Accuracy: " + str(accuracy) + "\n")
    if accuracy!=None:
        f.write("@scores\n")
        for score in model.decision_scores_:
            f.write(str(score) + "\n")
    f.close()

# This is based on executing the script from the folder experiments
ROUTE = "../datasets/outlier_ground_truth/"
# List of datasets
datasets = ["annthyroid.mat", "arrhythmia.mat", "breastw.mat", "cardio.mat", "glass.mat", "ionosphere.mat", "letter.mat", "lympho.mat", "mammography.mat", "mnist.mat", "musk.mat", "optdigits.mat", "pendigits.mat", "pima.mat", "satellite.mat", "satimage-2.mat", "speech.mat", "thyroid.mat", "vertebral.mat", "vowels.mat", "wbc.mat", "wine.mat"]
# List of models and names
models = [ABOD(), COF(), HBOS(), KNN(), LOF(), MCD(), OCSVM(), PCA(), SOD(), SOS()]
names = ["ABOD", "COF", "HBOS", "KNN", "LOF", "MCD", "OCSVM", "PCA", "SOD", "SOS"]
accuracies = []

for name, model in zip(names, models):
    print("\n\n#################################################################")
    print("MODEL " + name + " " + str(names.index(name)+1) + "/" + str(len(names)))
    print("#################################################################")
    acc = []
    for dat in datasets:
        if name=="ABOD" and dat in ["breastw.mat", "letter.mat", "satellite.mat"]:
            result = None
        else:
            print("Computing dataset " + dat + " " + str(datasets.index(dat)+1) + "/" + str(len(datasets)))
            # Read dataset
            dataset, labels = readDataset(ROUTE + dat)
Example #20
        # 60% data for training and 40% for testing
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=random_state)

        # standardizing data for processing
        #X_train_norm, X_test_norm = standardizer(X_train, X_test)
        X_norm = normalizeData(X)
        print(np.shape(X))
        print(np.shape(X_norm))

        classifiers = {
            'Local Outlier Factor (LOF)': LOF(
                contamination=outliers_fraction
            ),
            'Connectivity-Based Outlier Factor (COF)': COF(
                contamination=outliers_fraction
            ),
            'K Nearest Neighbors (KNN)': KNN(
                contamination=outliers_fraction
            ),
            'Average K Nearest Neighbors (AvgKNN)': KNN(
                method='mean',
                contamination=outliers_fraction
            ),
            'Median K Nearest Neighbors (MedKNN)': KNN(
                method='median',
                contamination=outliers_fraction
            ),
            'Subspace Outlier Detection (SOD)': SOD(
                contamination=outliers_fraction
            ) 
Example #21
def cof(n_neighbors, contamination, name):
    dataset = prepare_data(df_names[0])
    clf = COF(n_neighbors=n_neighbors, contamination=contamination).fit_predict(dataset)
    outlier_index = np.where(clf == 1)
    outlier_plot(dataset, outlier_index, contamination, n_neighbors, name)
    outlier_remove(outlier_index, 'df_without_outliers_cof.csv')
# Get the outlier labels and scores on the training data X_train
y_train_pred = clf.labels_  # classification labels on the training data (0: normal, 1: outlier)
y_train_scores = clf.decision_scores_  # outlier scores on the training data (higher means more anomalous)

# Use the fitted clf to predict outliers in unseen data
y_test_pred = clf.predict(new_origin_all[pos:])  # classification labels on unseen data (0: normal, 1: outlier)
y_test_scores = clf.decision_function(new_origin_all[pos:])  # outlier scores on unseen data

show_scatter(clf_name, df, y_train_pred, pos)




clf_name = 'COF'
clf = COF(n_neighbors=30)
clf.fit(new_origin_all[:pos])

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(new_origin_all[pos:])  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(new_origin_all[pos:])  # outlier scores

show_scatter(clf_name, df, y_train_pred, pos)

