Example #1
    def __init__(self,
                 n_clusters=2,
                 n_repeat=10,
                 *anomaly_detector_params0,
                 **anomaly_detector_params1):
        self.n_clusters = n_clusters
        self.n_repeat = n_repeat
        self.ad_parms0 = anomaly_detector_params0
        self.ad_parms1 = anomaly_detector_params1
        self.clf_ = None
        AnomalyDetector.__init__(self, *anomaly_detector_params0,
                                 **anomaly_detector_params1)
Example #2
    def fit_anomaly_detector(self, X, max_k=2, n_repeat=10, scores=None,
                             use_k=None, init_clusters=None, verbose=False,
                             scoring_method=mean):
        '''
        Fits the anomaly detector, selecting the number of clusters with a variation of the elbow-curve
        method applied to the per-cluster anomaly scores computed by the make_scores method.
        The elbow-point selection is described at
        http://stackoverflow.com/questions/2018178/finding-the-best-trade-off-point-on-a-curve
        :param X: data set to fit, a 2-dimensional numpy array or a DataObject with a class column
        :param max_k: maximum number of clusters to try
        :param n_repeat: repeat the clustering this many times and use the mean score as the elbow curve
        :param scores: an array of arrays of anomaly scores to use for fitting instead of calling make_scores.
            Each inner array contains the result of one or more make_scores calls; index i corresponds to k=i+1 clusters.
        :param use_k: if set, no automatic selection is made; use_k is used directly as the number of clusters
        :param init_clusters: initialize the clustering with the provided clusters; scores, use_k and n_repeat are ignored
        :param verbose: print progress info
        :param scoring_method: the method used to aggregate the anomaly scores of a clustering into a single
            evaluation score, e.g. numpy mean or std
        :return: an anomaly detector, built with the parameters provided in the constructor, fitted to the data
            with the best number of clusters tested
        '''

        best_k = None
        y = None
        if init_clusters is None:
            if use_k is None:
                ss = []
                if scores is None:
                    for n in range(n_repeat):
                        self.make_scores(X, max_k, verbose=verbose)
                        ss.append([scoring_method(s[s < inf]) for s in self.scores])
                else:
                    ss = [[scoring_method(s[s < inf]) for s in scores[i]]
                          for i in range(len(scores))]

                best_k, y = self.compute_best_elbow_k(ss)
            else:
                best_k = use_k

        ad_list = []
        score_stds = []
        for i in range(n_repeat):
            ad = AnomalyDetector(*self.ad_parms0, **self.ad_parms1)
            self.clustering_ = self._train_clf(ad, X, best_k, init_clusters, verbose=verbose)
            ad_scores = ad.anomaly_score(X, self.clustering_)
            score_stds.append(ad_scores[ad_scores < inf].std())
            ad_list.append(ad)
            if init_clusters is not None:
                break

        best_ad = ad_list[argmin(score_stds)]

        if verbose:
            print("best k", best_k)

        # Keep the elbow curve only when it was actually computed
        if use_k is not None or init_clusters is not None:
            self.cluster_curve_ = None
        else:
            self.cluster_curve_ = y

        return best_ad
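The elbow selection referenced in the docstring (the Stack Overflow link above) picks the point on the score curve that is farthest from the straight line joining the curve's endpoints. Below is a minimal, self-contained sketch of that idea; compute_best_elbow_k belongs to this class and its actual implementation is not shown here, so this is only an illustrative stand-in:

import numpy as np

def elbow_point(curve):
    """Return the index of the point farthest from the chord joining the
    first and last points of the curve (the elbow heuristic)."""
    y = np.asarray(curve, dtype=float)
    x = np.arange(len(y), dtype=float)
    # Unit vector along the chord from the first to the last point
    chord = np.array([x[-1] - x[0], y[-1] - y[0]])
    chord = chord / np.linalg.norm(chord)
    # Vectors from the first point to every point on the curve
    vecs = np.column_stack([x - x[0], y - y[0]])
    # Distance of each point to the chord = norm of the component
    # orthogonal to the chord direction
    proj = np.outer(vecs @ chord, chord)
    dists = np.linalg.norm(vecs - proj, axis=1)
    return int(np.argmax(dists))

# A curve that flattens out after k = 3 (index 2)
curve = [10.0, 6.0, 3.0, 2.6, 2.4, 2.3]
print(elbow_point(curve) + 1)  # -> 3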
Example #3
    def fit_anomaly_detector(self, data_object, poisson_onesided=True):
        if poisson_onesided:
            anomaly_detector = AnomalyDetector([
                P_PoissonOnesided(self.root_column + i, self.period_column)
                for i in range(self.num_of_event_columns)
            ])
        else:
            anomaly_detector = AnomalyDetector([
                P_Poisson(self.root_column + i, self.period_column)
                for i in range(self.num_of_event_columns)
            ])

        self._anomaly_detector = anomaly_detector.fit(data_object)

        return anomaly_detector
Example #5
    def make_scores(self, X, max_k, start_k=2, verbose=False):
        '''
        Returns an array with the individual anomaly scores of each example, for each number of clusters tried.
        :param X: array of arrays or DataObject
        :param max_k: maximum number of clusters
        :param start_k: cluster with start_k up to max_k (inclusive) clusters
        :param verbose: print progress info
        :return: array of arrays of anomaly scores, one per k from 1 to max_k (inclusive)
        '''
        ad = AnomalyDetector(*self.ad_parms0, **self.ad_parms1)
        ad.fit(X)
        score = ad.anomaly_score(X)
        scores = [list(score)]

        if verbose:
            print("Clusters", 1, "Score", score[score < inf].std(), sum(score == inf))

        min_percentile = percentile(score, 20)
        max_percentile = percentile(score, 80)

        for k in range(start_k, max_k + 1):
            clusters = self._train_clf(
                ad, X, k, verbose=verbose,
                marked_as_single_cluster=[min_percentile <= s <= max_percentile for s in score])

            score = ad.anomaly_score(X, clusters)
            scores += [list(score)]
            if verbose:
                print("Clusters", k, "Score", score[score < inf].std(), sum(score == inf))

        self.scores = array(scores)

        return self.scores
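The array returned by make_scores is what fit_anomaly_detector aggregates into an elbow curve: one summary value per k, computed over the finite scores only. A small illustrative sketch of that aggregation step, using only numpy; the values are made up:

from numpy import array, inf, mean, std

# Hypothetical output of make_scores for k = 1..3 (rows) over 5 examples (columns)
scores = array([
    [0.9, 1.1, inf, 0.8, 1.0],
    [0.5, 0.6, 0.7, inf, 0.4],
    [0.3, 0.2, 0.4, 0.3, inf],
])

# One aggregated value per k, ignoring infinite scores, as in fit_anomaly_detector
curve_mean = [mean(s[s < inf]) for s in scores]
curve_std = [std(s[s < inf]) for s in scores]
print(curve_mean)
print(curve_std)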
Example #6
    def test_conditional_gaussian_dependency_matrix(self):
        length = 100
        n_samples = 1000
        X = array([sample_markov_chain(length) for _ in range(n_samples)])

        # Next two should be equal
        s0 = AnomalyDetector(
            P_ConditionalGaussianDependencyMatrix(
                list(range(length)), length)).fit(X).anomaly_score(X)

        ad1 = AnomalyDetector(
            P_ConditionalGaussianCombiner([
                P_ConditionalGaussian([i + 1], [i]) for i in range(length - 1)
            ] + [P_ConditionalGaussian([0], [])]), cr_plus).fit(X)
        s1 = ad1.anomaly_score(X)

        assert_allclose(s0, s1, rtol=0.0001)  # OK

        # Most likely, these two are not equal but highly correlated
        ad2 = AnomalyDetector(
            [P_ConditionalGaussian([i], []) for i in range(length)],
            cr_plus).fit(X)
        s2 = ad2.anomaly_score(X)

        ad3 = AnomalyDetector(
            P_ConditionalGaussianCombiner(
                [P_ConditionalGaussian([i], []) for i in range(length)]),
            cr_plus).fit(X)
        s3 = ad3.anomaly_score(X)

        assert_equal(pearsonr(s2, s3)[0] > 0.985, True)

        # Test classification
        Y = array([sample_markov_chain(length, 0.2) for _ in range(n_samples)])
        Z = array([sample_markov_chain(length, 0.3) for _ in range(n_samples)])

        data = r_[X, Y, Z]
        labels = r_[['X'] * len(X), ['Y'] * len(Y), ['Z'] * len(Z)]

        data_index = shuffle(list(range(len(data))))
        training_set = data_index[:n_samples * 2]
        test_set = data_index[n_samples * 2:]

        models = {
            'independent gaussian':
            AnomalyDetector([P_Gaussian([i]) for i in range(length)], cr_plus),
            'independent conditional gaussian':
            AnomalyDetector(
                [P_ConditionalGaussian([i], []) for i in range(length)],
                cr_plus),
            'independent conditional gaussian with combiner':
            AnomalyDetector(
                P_ConditionalGaussianCombiner(
                    [P_ConditionalGaussian([i], []) for i in range(length)])),
            'single conditional gaussian with combiner':
            AnomalyDetector(
                P_ConditionalGaussianCombiner([
                    P_ConditionalGaussian([i], [i - 1])
                    for i in range(1, length)
                ] + [P_ConditionalGaussian([0], [])])),
            'dependency matrix':
            AnomalyDetector(
                P_ConditionalGaussianDependencyMatrix(list(range(length)), length))
        }

        all_acc = {}
        for key in models:
            ad = models[key].fit(data[training_set], labels[training_set])

            adclf = SklearnClassifier.clf(ad)

            labels_predicted = adclf.predict(data[test_set])
            accuracy = sum(labels[test_set] == labels_predicted) / float(
                len(test_set))
            all_acc[key] = accuracy
            print(key, "accuracy = ", accuracy)

        assert_close(all_acc['independent gaussian'],
                     all_acc['independent conditional gaussian'],
                     decimal=2)
        assert_close(all_acc['independent gaussian'],
                     all_acc['independent conditional gaussian with combiner'],
                     decimal=2)
        assert_close(all_acc['single conditional gaussian with combiner'],
                     all_acc['dependency matrix'],
                     decimal=2)
Example #7
    def test_conditional_gaussian(self):
        x = array([[x0] for x0 in norm(0, 1).rvs(1000)])

        gauss_scores = AnomalyDetector(P_Gaussian(0)).fit(x).anomaly_score(x)
        condgauss_scores = \
            AnomalyDetector(P_ConditionalGaussian([0], [])). \
                fit(x). \
                anomaly_score(x)

        assert_allclose(gauss_scores, condgauss_scores, atol=0.01, rtol=0.01)

        X = array(
            [[x0, x1]
             for x0, x1 in zip(norm(0, 1).rvs(1000),
                               norm(0, 1).rvs(1000))])

        gauss_scores_X = AnomalyDetector(P_Gaussian(
            [0])).fit(X).anomaly_score(X)
        condgauss_scores_X = \
            AnomalyDetector(P_ConditionalGaussian([0],[1])). \
                fit(X). \
                anomaly_score(X)

        assert_allclose(gauss_scores_X, condgauss_scores_X, atol=0.3)

        X = array(
            [[x0, x0 + 0.1 * x1]
             for x0, x1 in zip(norm(0, 1).rvs(1000),
                               norm(0, 1).rvs(1000))])

        # This is not equal at all
        gauss_scores_X = AnomalyDetector(P_Gaussian(
            [0, 1])).fit(X).anomaly_score(X)
        condgauss_scores_X = \
            AnomalyDetector(P_ConditionalGaussian([0,1],[])). \
                fit(X). \
                anomaly_score(X)

        assert_equal((pearsonr(gauss_scores_X, condgauss_scores_X)[0] > 0.994),
                     True)
        assert_allclose(gauss_scores_X, condgauss_scores_X, atol=2)  # Very bad

        X = array([[x0, x0 + 0.1 * x1, x2]
                   for x0, x1, x2 in c_[norm(0, 1).rvs(1000),
                                        norm(0, 1).rvs(1000),
                                        norm(0, 1).rvs(1000)]])

        # This is not equal at all
        gauss_scores_X = AnomalyDetector(P_Gaussian(
            [0, 1])).fit(X).anomaly_score(X)
        condgauss_scores_X = \
            AnomalyDetector(P_ConditionalGaussian([0, 1], [])). \
                fit(X). \
                anomaly_score(X)

        assert_equal((pearsonr(gauss_scores_X, condgauss_scores_X)[0] > 0.994),
                     True)
        assert_allclose(gauss_scores_X, condgauss_scores_X, atol=2)  # Very bad

        X = array([[x0, x0 + 0.1 * x1, x2]
                   for x0, x1, x2 in c_[norm(0, 1).rvs(1000),
                                        norm(0, 1).rvs(1000),
                                        norm(0, 1).rvs(1000)]])

        # This is not equal at all
        gauss_scores_X = AnomalyDetector(P_Gaussian(
            [0, 1, 2])).fit(X).anomaly_score(X)
        condgauss_scores_X = \
            AnomalyDetector(
                P_ConditionalGaussianCombiner([
                    P_ConditionalGaussian([0], [1,2]),
                    P_ConditionalGaussian([1], [2]),
                    P_ConditionalGaussian([2], []),
                ])). \
                fit(X). \
                anomaly_score(X)

        assert_equal((pearsonr(gauss_scores_X, condgauss_scores_X)[0] > 0.98),
                     True)
        assert_allclose(gauss_scores_X, condgauss_scores_X, atol=5)  # Very bad

        # This is very much equal
        gauss_scores_X = AnomalyDetector(P_ConditionalGaussian(
            [0, 1, 2], [])).fit(X).anomaly_score(X)
        condgauss_scores_X = \
            AnomalyDetector(
                P_ConditionalGaussianCombiner([
                    P_ConditionalGaussian([0], [1, 2]),
                    P_ConditionalGaussian([1], [2]),
                    P_ConditionalGaussian([2], []),
                ])). \
                fit(X). \
                anomaly_score(X)

        assert_allclose(gauss_scores_X, condgauss_scores_X, atol=0.001)

        # If we combine them using an ordinary combination rule, adding the anomaly scores together
        condgauss_scores_X2 = \
        AnomalyDetector(
            [
                P_ConditionalGaussian([0], [1, 2]),
                P_ConditionalGaussian([1], [2]),
                P_ConditionalGaussian([2], []),
            ], cr_plus). \
            fit(X). \
            anomaly_score(X)

        assert_equal(
            (pearsonr(condgauss_scores_X, condgauss_scores_X2)[0] > 0.99),
            True)  # Good

        assert_allclose(condgauss_scores_X2, condgauss_scores_X, atol=2)  # Bad

        #
        ad1 = AnomalyDetector([P_Gaussian([i]) for i in range(len(X[0]))],
                              cr_plus).fit(X)
        s1 = ad1.anomaly_score(X)

        ad2 = AnomalyDetector(
            [P_ConditionalGaussian([i], []) for i in range(len(X[0]))],
            cr_plus).fit(X)
        s2 = ad2.anomaly_score(X)

        print "r:", pearsonr(s1, s2)

        assert_allclose(s1, s2, rtol=0.01)  # OK
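The "very much equal" comparison above rests on the probability chain rule: for jointly Gaussian variables, the joint density factors exactly into a product of conditional Gaussians, p(x0, x1) = p(x0 | x1) p(x1). A small numpy/scipy check of that identity for a correlated bivariate Gaussian; the parameter values are arbitrary and chosen only for illustration:

import numpy as np
from scipy.stats import norm, multivariate_normal

rho, s0, s1 = 0.7, 1.0, 2.0
cov = np.array([[s0**2, rho * s0 * s1],
                [rho * s0 * s1, s1**2]])
joint = multivariate_normal(mean=[0.0, 0.0], cov=cov)

x = np.array([0.3, -1.2])

# Chain rule: log p(x0, x1) = log p(x0 | x1) + log p(x1)
cond_mean = rho * (s0 / s1) * x[1]
cond_std = s0 * np.sqrt(1.0 - rho**2)
lhs = joint.logpdf(x)
rhs = norm(cond_mean, cond_std).logpdf(x[0]) + norm(0.0, s1).logpdf(x[1])
print(np.isclose(lhs, rhs))  # True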
Example #8
    def _detector_fit(self, X, y):
        return AnomalyDetector.fit(self, X, y)
Example #9
    def _create_detector(self, *ad_parms0, **ad_parms1):
        return AnomalyDetector(*ad_parms0, **ad_parms1)
Example #10
    def loglikelihood(self, X, y=None):
        return AnomalyDetector.loglikelihood(
            self, X,
            self.clf_.predict(X) if self.clf_ is not None and y is None else y)
Example #11
    def anomaly_score(self, X, y=None):
        return AnomalyDetector.anomaly_score(
            self, X,
            self.clf_.predict(X) if self.clf_ is not None and y is None else y)
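Examples #10 and #11 use the same delegation pattern: when no cluster labels y are supplied, the fitted clustering model clf_ predicts them before delegating to the base AnomalyDetector. The sketch below is a generic, runnable illustration of that "predict labels if none are given" idiom only; LabelDelegator and score_per_cluster are illustrative names, not taken from the source:

import numpy as np
from sklearn.cluster import KMeans

class LabelDelegator:
    def __init__(self, n_clusters=2):
        self.clf_ = None
        self.n_clusters = n_clusters

    def fit(self, X):
        self.clf_ = KMeans(n_clusters=self.n_clusters, n_init=10).fit(X)
        return self

    def score_per_cluster(self, X, y=None):
        # Same guard as in the examples: only predict when clf_ exists and y is missing
        y = self.clf_.predict(X) if self.clf_ is not None and y is None else y
        y = np.asarray(y)
        return {int(label): float(X[y == label].mean()) for label in np.unique(y)}

X = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 5])
print(LabelDelegator(n_clusters=2).fit(X).score_per_cluster(X))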
Example #12
    def test_conditional_gaussian(self):
        x = array([[x0] for x0 in norm(0,1).rvs(1000)])

        gauss_scores = AnomalyDetector(P_Gaussian(0)).fit(x).anomaly_score(x)
        condgauss_scores = \
            AnomalyDetector(P_ConditionalGaussian([0], [])). \
                fit(x). \
                anomaly_score(x)

        assert_allclose(gauss_scores, condgauss_scores,atol=0.01,rtol=0.01)


        X = array([[x0, x1] for x0,x1 in zip(norm(0, 1).rvs(1000), norm(0, 1).rvs(1000)) ])

        gauss_scores_X = AnomalyDetector(P_Gaussian([0])).fit(X).anomaly_score(X)
        condgauss_scores_X = \
            AnomalyDetector(P_ConditionalGaussian([0],[1])). \
                fit(X). \
                anomaly_score(X)

        assert_allclose(gauss_scores_X, condgauss_scores_X, atol=0.3)


        X = array([[x0, x0+0.1*x1] for x0,x1 in zip(norm(0, 1).rvs(1000), norm(0, 1).rvs(1000)) ])


        # This is not equal at all
        gauss_scores_X = AnomalyDetector(P_Gaussian([0,1])).fit(X).anomaly_score(X)
        condgauss_scores_X = \
            AnomalyDetector(P_ConditionalGaussian([0,1],[])). \
                fit(X). \
                anomaly_score(X)

        assert_equal((pearsonr(gauss_scores_X, condgauss_scores_X)[0] > 0.994), True)
        assert_allclose(gauss_scores_X, condgauss_scores_X, atol=2) # Very bad


        X = array([[x0, x0 + 0.1 * x1, x2] for x0, x1, x2 in c_[norm(0, 1).rvs(1000), norm(0, 1).rvs(1000), norm(0, 1).rvs(1000)]])

        # This is not equal at all
        gauss_scores_X = AnomalyDetector(P_Gaussian([0, 1])).fit(X).anomaly_score(X)
        condgauss_scores_X = \
            AnomalyDetector(P_ConditionalGaussian([0, 1], [])). \
                fit(X). \
                anomaly_score(X)

        assert_equal((pearsonr(gauss_scores_X, condgauss_scores_X)[0] > 0.994), True)
        assert_allclose(gauss_scores_X, condgauss_scores_X, atol=2)  # Very bad


        X = array(
            [[x0, x0 + 0.1 * x1, x2] for x0, x1, x2 in c_[norm(0, 1).rvs(1000), norm(0, 1).rvs(1000), norm(0, 1).rvs(1000)]])

        # This is not equal at all
        gauss_scores_X = AnomalyDetector(P_Gaussian([0, 1,2])).fit(X).anomaly_score(X)
        condgauss_scores_X = \
            AnomalyDetector(
                P_ConditionalGaussianCombiner([
                    P_ConditionalGaussian([0], [1,2]),
                    P_ConditionalGaussian([1], [2]),
                    P_ConditionalGaussian([2], []),
                ])). \
                fit(X). \
                anomaly_score(X)

        assert_equal((pearsonr(gauss_scores_X, condgauss_scores_X)[0] > 0.98), True)
        assert_allclose(gauss_scores_X, condgauss_scores_X, atol=5)  # Very bad


        # This is very much equal
        gauss_scores_X = AnomalyDetector(P_ConditionalGaussian([0, 1, 2], [])).fit(X).anomaly_score(X)
        condgauss_scores_X = \
            AnomalyDetector(
                P_ConditionalGaussianCombiner([
                    P_ConditionalGaussian([0], [1, 2]),
                    P_ConditionalGaussian([1], [2]),
                    P_ConditionalGaussian([2], []),
                ])). \
                fit(X). \
                anomaly_score(X)

        assert_allclose(gauss_scores_X, condgauss_scores_X, atol=0.001)


        # If we combine them using an ordinary combination rule, adding the anomaly scores together
        condgauss_scores_X2 = \
        AnomalyDetector(
            [
                P_ConditionalGaussian([0], [1, 2]),
                P_ConditionalGaussian([1], [2]),
                P_ConditionalGaussian([2], []),
            ], cr_plus). \
            fit(X). \
            anomaly_score(X)


        assert_equal((pearsonr(condgauss_scores_X, condgauss_scores_X2)[0] > 0.99), True) # Good

        assert_allclose(condgauss_scores_X2, condgauss_scores_X, atol=2) # Bad


        #
        ad1 = AnomalyDetector(
            [P_Gaussian([i]) for i in range(len(X[0]))],
            cr_plus
        ).fit(X)
        s1 = ad1.anomaly_score(X)

        ad2 = AnomalyDetector(
            [P_ConditionalGaussian([i], []) for i in range(len(X[0]))],
            cr_plus
        ).fit(X)
        s2 = ad2.anomaly_score(X)

        print("r:", pearsonr(s1,s2))

        assert_allclose(s1, s2, rtol=0.01)  # OK
    def test_conditional_gaussian_dependency_matrix(self):
        length = 100
        n_samples = 1000
        X = array([sample_markov_chain(length) for _ in range(n_samples)])


        # Next two should be equal
        s0 = AnomalyDetector(
            P_ConditionalGaussianDependencyMatrix(list(range(length)),length)
        ).fit(X).anomaly_score(X)

        ad1=AnomalyDetector(
            P_ConditionalGaussianCombiner([P_ConditionalGaussian([i + 1], [i]) for i in range(length - 1)]+[P_ConditionalGaussian([0], [])]),
            cr_plus
        ).fit(X)
        s1 = ad1.anomaly_score(X)

        assert_allclose(s0, s1, rtol=0.0001)  # OK

        # Most likely, these two are not equal but highly correlated
        ad2=AnomalyDetector(
            [P_ConditionalGaussian([i], []) for i in range(length)],
            cr_plus
        ).fit(X)
        s2 = ad2.anomaly_score(X)

        ad3=AnomalyDetector(
            P_ConditionalGaussianCombiner([P_ConditionalGaussian([i], []) for i in range(length)]),
            cr_plus
        ).fit(X)
        s3 = ad3.anomaly_score(X)

        assert_equal(pearsonr(s2, s3)[0] > 0.985, True)


        # Test classification
        Y = array([sample_markov_chain(length,0.2) for _ in range(n_samples)])
        Z = array([sample_markov_chain(length,0.3) for _ in range(n_samples)])


        data = r_[X,Y,Z]
        labels = r_[['X']*len(X), ['Y']*len(Y), ['Z']*len(Z)]

        data_index = shuffle(list(range(len(data))))
        training_set = data_index[:n_samples*2]
        test_set = data_index[n_samples*2:]

        models = {
            'independent gaussian':
                AnomalyDetector([P_Gaussian([i]) for i in range(length)],cr_plus),
            'independent conditional gaussian':
                AnomalyDetector([P_ConditionalGaussian([i], []) for i in range(length)],cr_plus),
            'independent conditional gaussian with combiner':
                AnomalyDetector(P_ConditionalGaussianCombiner([P_ConditionalGaussian([i], []) for i in range(length)])),
            'single conditional gaussian with combiner':
                AnomalyDetector(P_ConditionalGaussianCombiner([P_ConditionalGaussian([i], [i-1]) for i in range(1, length)]+
                                                              [P_ConditionalGaussian([0], [])])),
            'dependency matrix':
                AnomalyDetector(P_ConditionalGaussianDependencyMatrix(list(range(length)),length))
        }

        all_acc = {}
        for key in models:
            ad=models[key].fit(data[training_set], labels[training_set])

            adclf = SklearnClassifier.clf(ad)

            labels_predicted = adclf.predict(data[test_set])
            accuracy = sum(labels[test_set]==labels_predicted)/float(len(test_set))
            all_acc[key] = accuracy
            print(key, "accuracy = ", accuracy)


        assert_close(all_acc['independent gaussian'],all_acc['independent conditional gaussian'],decimal=2)
        assert_close(all_acc['independent gaussian'], all_acc['independent conditional gaussian with combiner'],decimal=2)
        assert_close(all_acc['single conditional gaussian with combiner'], all_acc['dependency matrix'],decimal=2)