def test_outlier_detection():
    """Fit an EllipticEnvelope on seeded Gaussian noise and cross-check it.

    Checks that the raw decision function equals ``mahalanobis(X)``, that
    ``mahalanobis(X)`` equals the fitted ``dist_`` attribute, and that
    ``score`` against all-ones labels equals the fraction of samples that
    were not predicted as ``-1``.
    """
    n_samples = 100
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, 10)

    envelope = EllipticEnvelope(contamination=0.1)
    envelope.fit(X)
    predictions = envelope.predict(X)

    mahal = envelope.mahalanobis(X)
    raw_decision = envelope.decision_function(X, raw_values=True)
    assert_array_almost_equal(raw_decision, mahal)
    assert_array_almost_equal(mahal, envelope.dist_)

    # With all-ones targets, accuracy is the share of samples not labelled -1.
    n_flagged = predictions[predictions == -1].size
    expected_accuracy = (n_samples - n_flagged) / 100.
    assert_almost_equal(envelope.score(X, np.ones(n_samples)),
                        expected_accuracy)
def test_outlier_detection():
    """Exercise EllipticEnvelope before and after fitting.

    Before ``fit`` both ``predict`` and ``decision_function`` must raise
    ``NotFittedError``. After ``fit`` the raw decision values must match
    ``mahalanobis(X)`` and ``dist_``, ``score`` must equal the share of
    samples not predicted ``-1``, and the number of ``-1`` predictions must
    equal the number of negative non-raw decision values.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(100, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    # An unfitted estimator must refuse both query methods.
    for unfitted_call in (envelope.predict, envelope.decision_function):
        assert_raises(NotFittedError, unfitted_call, X)

    envelope.fit(X)
    predictions = envelope.predict(X)
    raw_decision = envelope.decision_function(X, raw_values=True)
    shifted_decision = envelope.decision_function(X, raw_values=False)

    assert_array_almost_equal(raw_decision, envelope.mahalanobis(X))
    assert_array_almost_equal(envelope.mahalanobis(X), envelope.dist_)

    n_flagged = predictions[predictions == -1].size
    assert_almost_equal(envelope.score(X, np.ones(100)),
                        (100 - n_flagged) / 100.0)

    n_predicted_outliers = sum(predictions == -1)
    n_negative_decisions = sum(shifted_decision < 0)
    assert n_predicted_outliers == n_negative_decisions
def test_outlier_detection():
    """Check EllipticEnvelope's fitted outputs against each other.

    Steps:
      1. ``predict`` / ``decision_function`` raise ``NotFittedError``
         while the estimator is unfitted.
      2. Raw decision values equal ``mahalanobis(X)``, which equals ``dist_``.
      3. ``score`` with all-ones labels equals the non-outlier fraction.
      4. ``-1`` predictions and negative (non-raw) decisions agree in count.
    """
    sample_count = 100
    random_state = np.random.RandomState(0)
    X = random_state.randn(sample_count, 10)
    detector = EllipticEnvelope(contamination=0.1)

    assert_raises(NotFittedError, detector.predict, X)
    assert_raises(NotFittedError, detector.decision_function, X)

    detector.fit(X)
    labels = detector.predict(X)
    decision_raw = detector.decision_function(X, raw_values=True)
    decision_offset = detector.decision_function(X, raw_values=False)

    assert_array_almost_equal(decision_raw, detector.mahalanobis(X))
    assert_array_almost_equal(detector.mahalanobis(X), detector.dist_)

    outlier_count = labels[labels == -1].size
    inlier_fraction = (100 - outlier_count) / 100.
    assert_almost_equal(detector.score(X, np.ones(sample_count)),
                        inlier_fraction)

    assert sum(labels == -1) == sum(decision_offset < 0)
def test_elliptic_envelope():
    """Validate EllipticEnvelope's scoring methods on seeded Gaussian data.

    Confirms the unfitted estimator raises ``NotFittedError``, that
    ``score_samples`` is the negated ``mahalanobis`` output, that
    ``mahalanobis(X)`` matches ``dist_``, that ``score`` with all-ones labels
    equals the fraction not predicted ``-1``, and that ``-1`` predictions
    coincide in count with negative ``decision_function`` values.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(100, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    # Neither query method may be used before fit().
    for unfitted_call in (envelope.predict, envelope.decision_function):
        assert_raises(NotFittedError, unfitted_call, X)

    envelope.fit(X)
    predictions = envelope.predict(X)
    sample_scores = envelope.score_samples(X)
    decision_values = envelope.decision_function(X)

    assert_array_almost_equal(sample_scores, -envelope.mahalanobis(X))
    assert_array_almost_equal(envelope.mahalanobis(X), envelope.dist_)

    n_flagged = predictions[predictions == -1].size
    assert_almost_equal(envelope.score(X, np.ones(100)),
                        (100 - n_flagged) / 100.)

    n_predicted_outliers = sum(predictions == -1)
    n_negative_decisions = sum(decision_values < 0)
    assert (n_predicted_outliers == n_negative_decisions)
Exemplo n.º 5
0
    def calc(self, outliers_fraction):
        """Score rows by Mahalanobis distance and select the "bad" outliers.

        Fits an ``EllipticEnvelope`` on the ``Tbandwidth``, ``Tlatency`` and
        ``Tframerate`` columns of the frame returned by ``self.get_data()``,
        stores the resulting distances in a new ``MDist`` column of that
        frame, and filters the groups produced by ``chi2_outliers``.

        Parameters
        ----------
        outliers_fraction : float
            Forwarded to ``EllipticEnvelope`` as ``contamination``.

        Returns
        -------
        tuple
            ``(outliers, dqs, ranked)``:

            * ``outliers`` -- list with one filtered frame per group
              returned by ``chi2_outliers``;
            * ``dqs`` -- passed through unchanged from ``self.get_data()``;
            * ``ranked`` -- ``data`` sorted by descending ``MDist``,
              de-duplicated, with the three feature columns dropped.
        """
        feature_cols = ['Tbandwidth', 'Tlatency', 'Tframerate']

        def _rank(frame):
            # Largest distance first, duplicates removed, features dropped.
            return (frame.sort_values(by='MDist', ascending=False)
                    .drop_duplicates()
                    .drop(feature_cols, axis=1))

        # The third value returned by get_data() is not used here.
        data, dqs, _ = self.get_data()
        clf = EllipticEnvelope(contamination=outliers_fraction)

        # Materialise the rows: on Python 3 ``zip`` is a one-shot iterator
        # and cannot feed both fit() and mahalanobis().
        X = list(zip(data['Tbandwidth'], data['Tlatency'], data['Tframerate']))
        clf.fit(X)
        data['MDist'] = clf.mahalanobis(X)

        # NOTE(review): the meaning of the [.8, .9, .95] and 3 arguments is
        # defined by chi2_outliers, which is not visible here -- confirm.
        outliers = []
        for group in chi2_outliers(data, [.8, .9, .95], 3):
            # Keep only "bad" outliers, not good ones.
            group = group[group['Tbandwidth'] < group['Tlatency']]
            # Make sure we don't keep aberrantly good framerates: the cut-off
            # is computed on the rows that survived the previous filter.
            ceiling = group['Tframerate'].mean() + group['Tframerate'].std()
            group = group[group['Tframerate'] < ceiling]
            outliers.append(_rank(group))

        return outliers, dqs, _rank(data)
def test_elliptic_envelope():
    """Sanity-check EllipticEnvelope on 100 seeded 10-dimensional samples.

    Steps:
      1. ``predict`` and ``decision_function`` raise ``NotFittedError``
         before ``fit``.
      2. ``score_samples(X)`` equals ``-mahalanobis(X)``.
      3. ``mahalanobis(X)`` equals the fitted ``dist_`` attribute.
      4. ``score`` with all-ones labels equals the share not predicted ``-1``.
      5. The count of ``-1`` predictions equals the count of negative
         ``decision_function`` values.
    """
    sample_count = 100
    random_state = np.random.RandomState(0)
    X = random_state.randn(sample_count, 10)
    detector = EllipticEnvelope(contamination=0.1)

    assert_raises(NotFittedError, detector.predict, X)
    assert_raises(NotFittedError, detector.decision_function, X)

    detector.fit(X)
    labels = detector.predict(X)
    negated_distances = detector.score_samples(X)
    decision_values = detector.decision_function(X)

    assert_array_almost_equal(negated_distances, -detector.mahalanobis(X))
    assert_array_almost_equal(detector.mahalanobis(X), detector.dist_)

    outlier_count = labels[labels == -1].size
    inlier_fraction = (100 - outlier_count) / 100.
    assert_almost_equal(detector.score(X, np.ones(sample_count)),
                        inlier_fraction)

    assert(sum(labels == -1) == sum(decision_values < 0))
Exemplo n.º 7
0
def test_outlier_detection():
    """Compare EllipticEnvelope's raw decision values with mahalanobis().

    After fitting on seeded Gaussian noise, the decision function requested
    with ``raw_mahalanobis=True`` must equal ``mahalanobis`` evaluated on the
    data shifted by ``location_``, and ``score`` with all-ones labels must
    equal the fraction of samples not predicted ``-1``.
    """
    n_samples = 100
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, 10)

    envelope = EllipticEnvelope(contamination=0.1)
    envelope.fit(X)
    predictions = envelope.predict(X)

    raw_decision = envelope.decision_function(X, raw_mahalanobis=True)
    centred_distances = envelope.mahalanobis(X - envelope.location_)
    assert_array_almost_equal(raw_decision, centred_distances)

    n_flagged = predictions[predictions == -1].size
    expected_accuracy = (n_samples - n_flagged) / 100.
    assert_almost_equal(envelope.score(X, np.ones(n_samples)),
                        expected_accuracy)
Exemplo n.º 8
0
def elliptEnvMethod(data, uniqueTrains, **kwargs):
    """Flag anomalies using Mahalanobis distance with an automatic cut-off.

    An ``EllipticEnvelope`` (``support_fraction=1``) is fitted on ``data``
    and the Mahalanobis distance of every row is computed. The cut-off
    ``SIGMA`` is the first distance in the upper half of the sorted distances
    that exceeds its predecessor by more than ``SCORE_INCREASE_RATIO``. If no
    such jump exists an arbitrarily high value is used, and the cut-off is
    never allowed below 4.0. Rows whose distance is ``>= SIGMA`` are labelled
    1, all others 0.

    Parameters
    ----------
    data : array-like
        Samples passed to ``EllipticEnvelope.fit`` and ``mahalanobis``.
    uniqueTrains : sequence
        One entry per row of ``data``; entries whose row is labelled 1 are
        returned as anomalies.
    **kwargs
        ``dates`` (optional): indexable with one entry per row of ``data``;
        entries whose row is labelled 1 are returned as ``anomalyDates``.

    Returns
    -------
    tuple
        ``(anomalies, labels, anomalyDates)``. ``labels`` is an integer
        array of 0/1 flags; ``anomalyDates`` is empty when no ``dates``
        keyword was supplied.
    """
    # Only the fitted state is needed; predictions were never used.
    elp = EllipticEnvelope(support_fraction=1)
    elp.fit(data)

    # mahalanobis() yields squared distances; take the root to get the
    # regular Mahalanobis distances.
    elp_d = np.sqrt(elp.mahalanobis(data))

    # Automated cut-off: look for the first large relative jump between
    # consecutive sorted distances.
    SCORE_INCREASE_RATIO = 1.3

    # Upper half of the sorted distances (integer division keeps the slice
    # index an int on every Python version).
    sortD = np.sort(elp_d)
    sortD = sortD[len(sortD) // 2:]

    # ratioD[i] compares sortD[i + 1] with sortD[i].
    ratioD = sortD[1:] / sortD[:-1]
    jumps = np.flatnonzero(ratioD > SCORE_INCREASE_RATIO)

    if jumps.size >= 1:
        # The distance that jumped relative to the one before it.
        SIGMA = sortD[jumps[0] + 1]
    else:
        # Arbitrarily high cut-off: there are no big jumps in the distances.
        SIGMA = 100.0

    SIGMA = max(SIGMA, 4.0)  # keep the cut-off from being too low

    labels = (elp_d >= SIGMA).astype(int)

    anomalies = [
        train for pos, train in enumerate(uniqueTrains) if labels[pos] == 1
    ]

    anomalyDates = []
    dates = kwargs.get('dates')
    if dates is not None:
        anomalyDates = [dates[i] for i, val in enumerate(labels) if val == 1]

    return anomalies, labels, anomalyDates
Exemplo n.º 9
0
# Loadings of the first two components.
# NOTE(review): `pcac` and `cfat` are defined outside this snippet;
# presumably a fitted PCA and the frame it was fitted on -- confirm.
rot = pcac.components_
pd.DataFrame(rot[0, :], index=cfat.columns).round(3).T

#

pd.DataFrame(rot[1, :], index=cfat.columns).round(3).T

#

from sklearn.covariance import EllipticEnvelope
ee = EllipticEnvelope()
ee.fit(cfat)

#

# mahalanobis() is square-rooted here to plot distances on the original scale.
md = np.sqrt(ee.mahalanobis(cfat))
n = len(md)
ix = np.arange(1, n + 1)
# Upper-half normal quantiles for the n sorted distances (no trailing comma:
# this must be an array, not a 1-tuple wrapping one).
halfq = sp.stats.norm.ppf((n + ix) / (2 * n + 1))
plt.scatter(halfq, np.sort(md))
# NOTE(review): the x values are normal-distribution quantiles while the
# label below says chi-squared -- confirm which one is intended.
plt.xlabel(r'$\chi^2$ quantiles')
plt.ylabel('Mahalanobis distances')

#

xmat = sm.add_constant(cfat)
lmod = sm.OLS(fat.brozek, xmat).fit()
# NOTE(review): `sumary` is not a stock statsmodels results method;
# presumably supplied by a helper imported outside this snippet -- confirm
# before "correcting" the spelling.
lmod.sumary()

#
Exemplo n.º 10
0
import numpy as np
from scipy.io import loadmat
from sklearn.covariance import EmpiricalCovariance, EllipticEnvelope
from sklearn.metrics import accuracy_score, classification_report

# Number of outliers in the cardio data.
N_OUTLIERS = 176


def _flag_top_k(distances, y_true, k):
    """Return labels shaped like ``y_true`` with the ``k`` largest distances set to 1."""
    # kth=-k puts the k largest values in the last k slots of the partition;
    # a positive kth would only guarantee the position of the k-th smallest.
    top_k = np.argpartition(distances, -k)[-k:]
    flagged = np.zeros(y_true.shape)
    flagged[top_k] = 1
    return flagged


def _report(y_true, y_pred):
    """Print the classification report followed by the accuracy."""
    print(classification_report(y_true, y_pred))
    print(accuracy_score(y_true, y_pred))


cardio_data = loadmat('cardio.mat')

# Baseline: distances under the plain empirical covariance.
estimator = EmpiricalCovariance()
cov = estimator.fit(cardio_data['X'])
mahal_cov = cov.mahalanobis(cardio_data['X'])
y_pred = _flag_top_k(mahal_cov, cardio_data['y'], N_OUTLIERS)
_report(cardio_data['y'], y_pred)

# Robust alternative: the envelope must be fitted on the samples themselves
# (n_samples x n_features), not on the Gram matrix X.T @ X.
cov = EllipticEnvelope().fit(cardio_data['X'])
mahal_cov = cov.mahalanobis(cardio_data['X'])
y_pred = _flag_top_k(mahal_cov, cardio_data['y'], N_OUTLIERS)
_report(cardio_data['y'], y_pred)
    fig = plt.figure()

    ax = fig.add_subplot(111)
    arr[:, 0] /= arr[:, 2]
    arr[:, 1] /= arr[:, 2]
    a = arr[:, 0:2]
    ee = EllipticEnvelope()
    try:
        ee.fit(a)
    except:
        continue

    dsts = ee.decision_function(a).ravel()
    m1, m2 = min(dsts), max(dsts)

    maaa = ee.mahalanobis(a)

    min_x, max_x = min(arr[:, 0]), max(arr[:, 0])
    min_y, max_y = min(arr[:, 1]), max(arr[:, 1])

    min_x -= 10
    max_x += 10
    min_y -= 10
    max_y += 10

    xx, yy = np.meshgrid(np.linspace(min_x, max_x, 500),
                         np.linspace(min_y, max_y, 500))
    Z = ee.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    ax.contour(xx,