def test_EmpiricalCovariance_validates_mahalanobis():
    """Checks that EmpiricalCovariance validates data with mahalanobis."""
    cov = EmpiricalCovariance().fit(X)

    msg = f"X has 2 features, but \\w+ is expecting {X.shape[1]} features as input"
    with pytest.raises(ValueError, match=msg):
        cov.mahalanobis(X[:, :2])
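
For context, a self-contained sketch of the same check (the toy X here is an assumption; the real test suite builds its fixture elsewhere):

import numpy as np
import pytest
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
X = rng.randn(100, 5)  # hypothetical stand-in for the test fixture

cov = EmpiricalCovariance().fit(X)
# Feature-count validation: fit saw 5 features, so passing 2 must fail.
with pytest.raises(ValueError, match="X has 2 features"):
    cov.mahalanobis(X[:, :2])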
Example #2
  def plot_contours(self, ax, show=False):
    COV = self.emp_cov
    COV_slice = EmpiricalCovariance()
    COV_slice.location_ = COV.location_[:2]
    COV_slice.covariance_ = COV.covariance_[:2, :2]
    COV_slice.precision_ = COV.precision_[:2, :2]

    # Show contours of the distance functions
    xx, yy = np.meshgrid(
          np.linspace(COV_slice.location_[0]-5*math.sqrt(COV_slice.covariance_[0,0]), COV_slice.location_[0]+5*math.sqrt(COV_slice.covariance_[0,0]), 100),
          np.linspace(COV_slice.location_[1]-5*math.sqrt(COV_slice.covariance_[1,1]), COV_slice.location_[1]+5*math.sqrt(COV_slice.covariance_[1,1]), 100),
    )
    zz = np.c_[xx.ravel(), yy.ravel()]

    # Empirical fit is not so good.  Don't plot this
    if False: # keep for debugging
      mahal_emp_cov = COV_slice.mahalanobis(zz)
      mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
      emp_cov_contour = ax.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  levels=[1.,2.,3.,4.,5.],
                                  #cmap=plt.cm.PuBu_r,
                                  cmap=plt.cm.cool_r,
                                  linestyles='dashed')

    COV = self.rob_cov
    COV_slice = EmpiricalCovariance()
    COV_slice.location_ = COV.location_[:2]
    COV_slice.covariance_ = COV.covariance_[:2, :2]
    COV_slice.precision_ = COV.precision_[:2, :2]
    self.robust_model_XY = COV_slice

    # robust is better
    if show:
      mahal_robust_cov = COV_slice.mahalanobis(zz)
      mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
      robust_contour = ax.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 levels=[1.,2.,3.,4.,5.],
                                 #cmap=plt.cm.YlOrBr_r,
                                 cmap=plt.cm.spring_r,
                                 linestyles='dotted')
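
The hand-built 2-D slice above can be written more compactly. One subtlety worth noting: slicing precision_ (as the code above does) yields the precision block of the conditional distribution, whereas the 2-D marginal's precision is the inverse of the sliced covariance. A sketch under that reading, assuming cov is any fitted covariance estimator:

import numpy as np
from sklearn.covariance import EmpiricalCovariance

def marginal_2d(cov, i=0, j=1):
    # 2-D marginal model over features i and j of a fitted estimator.
    sliced = EmpiricalCovariance()
    sliced.location_ = cov.location_[[i, j]]
    sliced.covariance_ = cov.covariance_[np.ix_([i, j], [i, j])]
    # Marginal precision = inverse of the covariance block,
    # not a block of the full precision matrix.
    sliced.precision_ = np.linalg.inv(sliced.covariance_)
    return sliced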
Example #3
def test_covariance():
    """Tests Covariance module on a simple dataset.

    """
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), squared=False), 0)
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert np.amax(mahal_dist) < 250
    assert np.amin(mahal_dist) > 50

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)
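
The quantities this test checks can be reproduced by hand. A sketch with a toy X (an assumption, not the test's fixture), using the MLE convention (divide by n) that empirical_covariance defaults to:

import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
X = rng.randn(50, 3)

mu = X.mean(axis=0)
Xc = X - mu
S = Xc.T @ Xc / X.shape[0]          # MLE covariance (1/n)

# Squared Mahalanobis distance: (x - mu)^T S^{-1} (x - mu)
P = np.linalg.inv(S)
d2 = np.einsum('ij,jk,ik->i', Xc, P, Xc)

cov = EmpiricalCovariance().fit(X)
assert np.allclose(S, cov.covariance_)
assert np.allclose(d2, cov.mahalanobis(X))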
Example #5
class OneClassMahalanobis(BaseClassifier):
    _fit_params = ['perc_keep']
    _predict_params = []
    def __init__(self,*args, **kwargs):
#         BaseClassifier.__init__(self, *args, **kwargs)
        self.perc_keep = kwargs["perc_keep"]
    
    def fit(self, data):
        nu = 0.01
        n_sample  = data.shape[0]
        n_feature = data.shape[1]
        
        exclude = set()
        for d in range(n_feature):
            feature = data[:, d]
            s_feature = feature.copy()
            s_feature.sort()
            low = s_feature[int(n_sample*nu/2)]
            upp = s_feature[n_sample-int(n_sample*nu/2)]

            exld = numpy.nonzero(numpy.logical_or(feature > upp, feature < low))[0]
            exclude.update(exld)
            
        use = numpy.array([f for f in range(n_sample) if f not in exclude])
        
        data_ = data[use, :]
            
        self.cov = EmpiricalCovariance().fit(data_)
        
        dist = self.cov.mahalanobis(data)
        
        self.cutoff = numpy.percentile(dist, self.perc_keep)
        print(self.cutoff)
    

    
    def predict(self, data):
        mahal_dist = self.cov.mahalanobis(data)
        self.mahal_dist = mahal_dist
        print(mahal_dist.min(), mahal_dist.max(), self.cutoff, (mahal_dist > self.cutoff).sum(), "of", len(mahal_dist))
        
        return (mahal_dist > self.cutoff).astype(numpy.uint8)*-2+1
    
    def decision_function(self, data=None):
        return self.mahal_dist
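
A minimal usage sketch for the class above (synthetic data; assumes BaseClassifier is importable from the surrounding project):

import numpy
rng = numpy.random.RandomState(0)
train = rng.randn(500, 4)
test = numpy.vstack([rng.randn(20, 4), 6.0 * rng.randn(5, 4)])  # last 5 are gross outliers

clf = OneClassMahalanobis(perc_keep=95)
clf.fit(train)
pred = clf.predict(test)  # +1 for inliers, -1 beyond the fitted cutoff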
def detectOutlier(X):
    X = np.transpose(X)
    outlierVec = []
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    #robust_cov = MinCovDet().fit(X)
    robust_cov = EmpiricalCovariance().fit(X)
    outlierVec = robust_cov.mahalanobis(X)
    #proj
    return np.sqrt(outlierVec)
Example #7
def mahalanobisDistances(dm):
    reduced_data = PCA(n_components=2).fit_transform(dm)
    robust_cov = MinCovDet().fit(reduced_data)

    emp_cov = EmpiricalCovariance().fit(reduced_data)
    fig = plt.figure()
    plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
    subfig1 = plt.subplot(3, 1, 1)
    inlier_plot = subfig1.scatter(reduced_data[:, 0], reduced_data[:, 1], color='black', label='inliers')

    subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
    subfig1.set_title("Mahalanobis distances of a contaminated data set:")

    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
                     np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz)
    mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
    emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz)
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
    robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')

    plt.xticks(())
    plt.yticks(())

    # Plot the scores for each point
    emp_mahal = emp_cov.mahalanobis(reduced_data - np.mean(reduced_data, 0)) ** (0.33)
    subfig2 = plt.subplot(2, 2, 3)

    plt.yticks(())

    robust_mahal = robust_cov.mahalanobis(reduced_data - robust_cov.location_) ** (0.33)
    subfig3 = plt.subplot(2, 2, 4)

    plt.yticks(())

    plt.show()
def mahal_plot(e):
    first_half = e[1:len(e) - 1]
    second_half = e[2:len(e)]
    X = np.array([first_half, second_half])
    X = np.transpose(X)
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(X)

    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)

    fig = plt.figure()

    # Show data set
    subfig1 = plt.subplot(1, 1, 1)
    inlier_plot = subfig1.scatter(first_half,
                                  second_half,
                                  color='black',
                                  label='daily diff in homes passed')

    subfig1.set_title("Mahalanobis distances of the iid invariants:")

    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(plt.xlim()[0],
                                     plt.xlim()[1], 800),
                         np.linspace(plt.ylim()[0],
                                     plt.ylim()[1], 100))

    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz)
    mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
    emp_cov_contour = subfig1.contour(xx,
                                      yy,
                                      np.sqrt(mahal_emp_cov),
                                      cmap=plt.cm.PuBu_r,
                                      linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz)
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
    robust_contour = subfig1.contour(xx,
                                     yy,
                                     np.sqrt(mahal_robust_cov),
                                     cmap=plt.cm.YlOrBr_r,
                                     linewidths=3)

    subfig1.legend([
        emp_cov_contour.collections[1], robust_contour.collections[1],
        inlier_plot
    ], ['MLE dist', 'robust dist', 'kpis'],
                   loc="upper right",
                   borderaxespad=0)
    print(np.corrcoef(first_half, second_half))
    return (robust_cov, emp_cov)
Example #9
class Mahalanobis(BaseEstimator):
    """Mahalanobis distance estimator. Uses Covariance estimate
    to compute mahalanobis distance of the observations
    from the model.

    Parameters
    ----------
    robust : boolean to determine whether to use robust estimator
        based on Minimum Covariance Determinant computation
    """
    def __init__(self, robust=False):
        if not robust:
            from sklearn.covariance import EmpiricalCovariance as CovarianceEstimator  #
        else:
            from sklearn.covariance import MinCovDet as CovarianceEstimator  #
        self.model = CovarianceEstimator()
        self.cov = None

    def fit(self, X, y=None, **params):
        """Fits the covariance model according to the given training
        data and parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        self : object
            Returns self.
        """
        self.cov = self.model.fit(X)
        return self

    def score(self, X, y=None):
        """Computes the mahalanobis distances of given observations.

        The provided observations are assumed to be centered. One may want to
        center them using a location estimate first.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
          The observations whose Mahalanobis distances we compute.

        Returns
        -------
        mahalanobis_distance : array, shape = [n_observations,]
            Mahalanobis distances of the observations.
        """

        #return self.model.score(X,assume_centered=True)
        return -self.model.mahalanobis(X - self.model.location_)**0.33
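
A usage sketch (synthetic data): score() returns the negated cube-root distance, so points near the fitted location get larger (less negative) values:

import numpy as np

rng = np.random.RandomState(0)
est = Mahalanobis(robust=False).fit(rng.randn(200, 3))
scores = est.score(np.array([[0.0, 0.0, 0.0],
                             [5.0, 5.0, 5.0]]))
# scores[0] > scores[1]: the origin is closer to the model.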
Example #10
File: models.py  Project: pborky/pynfsa
class Mahalanobis (BaseEstimator):
    """Mahalanobis distance estimator. Uses Covariance estimate
    to compute mahalanobis distance of the observations
    from the model.

    Parameters
    ----------
    robust : boolean to determine whether to use robust estimator
        based on Minimum Covariance Determinant computation
    """
    def __init__(self, robust=False):
        if not robust:
            from sklearn.covariance import EmpiricalCovariance as CovarianceEstimator #
        else:
            from sklearn.covariance import MinCovDet as CovarianceEstimator #
        self.model = CovarianceEstimator()
        self.cov = None
    def fit(self, X, y=None, **params):
        """Fits the covariance model according to the given training
        data and parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        self : object
            Returns self.
        """
        self.cov = self.model.fit(X)
        return self
    def score(self, X, y=None):
        """Computes the mahalanobis distances of given observations.

        The provided observations are assumed to be centered. One may want to
        center them using a location estimate first.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
          The observations whose Mahalanobis distances we compute.

        Returns
        -------
        mahalanobis_distance : array, shape = [n_observations,]
            Mahalanobis distances of the observations.
        """

        #return self.model.score(X,assume_centered=True)
        return - self.model.mahalanobis(X-self.model.location_) ** 0.33
Example #11
def test_covariance():
    """Tests Covariance module on a simple dataset.

    """
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError,
                  cov.error_norm, emp_cov, norm='foo')
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    # FIXME I don't know what this test does
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    assert_warns(UserWarning, cov.fit, X_1sample)
    assert_array_almost_equal(cov.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
Example #12
    def get_dose_dist_from_control(self, control, metric='euclidean'):

        mean_control = np.mean(control, axis=0).reshape(1, -1)
        mean_dose = self.get_mean_doses()

        if 'eucl' in metric:
            from sklearn.metrics import euclidean_distances
            dist = euclidean_distances(mean_control, mean_dose)
        elif 'mahalan' in metric:
            from sklearn.covariance import EmpiricalCovariance
            cov = EmpiricalCovariance().fit(control)
            dist = cov.mahalanobis(mean_dose)
        else:
            raise ValueError("metric must contain 'eucl' or 'mahalan'")

        return dist
def mahalanobis_plot(ctry=None, df=None, weighted=True, inliers=False):
    """
    See http://scikit-learn.org/0.13/modules/outlier_detection.html#\
        fitting-an-elliptic-envelop

    for details.
    """
    if df is None and ctry is None:
        raise ValueError('Either the country or a dataframe must be supplied')
    elif df is None:
        df = load_res(ctry, weighted=weighted)
    if inliers:
        df = get_inliers(df=df)
    X = df.values
    robust_cov = MinCovDet().fit(X)
    #-----------------------------------------------------------------------------
    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)
    #-----------------------------------------------------------------------------
    # Display results
    fig = plt.figure()
    fig.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
    #-----------------------------------------------------------------------------
    # Show data set
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(X[:, 0], X[:, 1], alpha=.5, color='k', marker='.')
    ax1.set_title(country_code[ctry])
    #-----------------------------------------------------------------------------
    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(ax1.get_xlim()[0], ax1.get_xlim()[1],
                                     100),
                         np.linspace(ax1.get_ylim()[0], ax1.get_ylim()[1],
                                     100))
    zz = np.c_[xx.ravel(), yy.ravel()]
    #-----------------------------------------------------------------------------
    mahal_emp_cov = emp_cov.mahalanobis(zz)
    mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
    emp_cov_contour = ax1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')
    #-----------------------------------------------------------------------------
    mahal_robust_cov = robust_cov.mahalanobis(zz)
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
    robust_contour = ax1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')
    ax1.legend([emp_cov_contour.collections[1], robust_contour.collections[1]],
               ['MLE dist', 'robust dist'],
               loc="upper right", borderaxespad=0)
    ax1.grid()
    return (fig, ax1, ctry)
Example #14
def test_covariance():
    """Tests Covariance module on a simple dataset.

    """
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError,
                  cov.error_norm, emp_cov, norm='foo')
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    with warnings.catch_warnings(record=True):
        cov.fit(X_1sample)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_equal(cov.location_, np.zeros(X.shape[1]))
Example #15
def outlier_rejection(feat, prob):
    '''Return a boolean inlier mask at chi-squared probability `prob`.'''
    
    from sklearn.covariance import EmpiricalCovariance #MinCovDet
    
    #real_cov
    #linalg.inv(real_cov)
    
    #robust_cov = MinCovDet().fit(feat)
    robust_cov = EmpiricalCovariance().fit(feat)
    dist = robust_cov.mahalanobis(feat - numpy.median(feat, 0))
    
    cut = scipy.stats.chi2.ppf(prob, feat.shape[1])
    return dist < cut
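
The cutoff works because, for p-variate Gaussian data, squared Mahalanobis distances follow (approximately, once the covariance is estimated) a chi-squared distribution with p degrees of freedom, so chi2.ppf(prob, p) keeps about a prob fraction of inliers. A sketch checking that coverage on synthetic Gaussian data (centered with the fitted mean rather than the median used above):

import numpy
import scipy.stats
from sklearn.covariance import EmpiricalCovariance

rng = numpy.random.RandomState(0)
feat = rng.randn(5000, 3)

cov = EmpiricalCovariance().fit(feat)
dist = cov.mahalanobis(feat)                      # squared distances
cut = scipy.stats.chi2.ppf(0.975, feat.shape[1])
print((dist < cut).mean())                        # ~0.975 on Gaussian data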
Example #16
def test_covariance():
    # Tests Covariance module on a simple dataset.
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    with pytest.raises(NotImplementedError):
        cov.error_norm(emp_cov, norm='foo')
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    # Create X with 1 sample and 5 features
    X_1sample = np.arange(5).reshape(1, 5)
    cov = EmpiricalCovariance()
    warn_msg = (
        "Only one sample available. You may want to reshape your data array")
    with pytest.warns(UserWarning, match=warn_msg):
        cov.fit(X_1sample)

    assert_array_almost_equal(cov.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
def main():
    print ("Running CV on Mahalanobis Distance based approach.")
    mahanalobis()

    start_time = time.time()
    totalX = []
    totalY = []
    flag = True
    countTrain = 228000
    print ("\n\nNow testing on separate data.")
    with open("creditcard.csv", "rb") as f:
        data = csv.reader(f)
        for row in data:
            if flag:
                flag = False
                continue
            countTrain += 1
            if countTrain > 228000:          # test on the remaining 20% of data
                totalX.append([float(i) for i in row[:-1]])
                totalY.append(int(row[-1]))
    print ("Data Loaded")
    totalX = scalar.fit_transform(totalX)
    clf = EmpiricalCovariance()
    clf.fit(totalX)
    distances = clf.mahalanobis(totalX)

    Y = []
    for i in range(len(totalY)):

        if np.log10(distances[i]) > 1.838:
            Y.append(1)
        else:
            Y.append(0)
    print("%s seconds" % (time.time() - start_time))
    print ("Results")
    auc = roc_auc_score(totalY, Y)
    print("Area under curve : " + str(auc))
    fpr, tpr, _ = roc_curve(totalY, Y)
    print ("False Positive Rate : " + str(fpr[1]))
    _, recall, _ = precision_recall_curve(totalY, Y)
    print ("Recall : " + str(recall[1]))

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.3f)' % auc)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc="lower right")
    plt.show()
class ChangeDetector(object):
    """
    Joint Gaussian Change detector using a scikit learn style interface
    
    This class is really a wrapper around the methods in scikit learn for estimating covariance using
    robust or empirical methods and calculating the mahalanobis distances.

    """
    def __init__(self, method='robust', estimator_kw_args={}):
        if method == 'robust':
            self.covariance_estimator_ = MinCovDet(**estimator_kw_args)
        elif method == 'empirical':
            self.covariance_estimator_ = EmpiricalCovariance(
                **estimator_kw_args)
        else:
            raise ValueError(
                "{} is not a valid method. Must be one of 'robust' or 'empirical'"
                .format(method))

    def fit(self, X):
        """
        Fits the estimator.

        Parameters:
        -----------
        X - array of time series, shape (n_series, len_series)
        """
        self.covariance_estimator_ = self.covariance_estimator_.fit(X)
        return self

    def predict(self, X, threshold):
        """
        Returns true for each time series predicted as change. Also returns the mahalanobis distances

        parameters:
        -----------
        X - array of time series, shape (n_series, len_series)
        threshold - float

        returns:
        y_pred - shape (n_time_series), true if change detected
        distances - shape (n_time_series). The Mahalanobis distances of each time series under the fitted distribution
        """
        distances = self.covariance_estimator_.mahalanobis(X)
        return distances > threshold, distances
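
A usage sketch for the detector (synthetic series; the chi-squared threshold is one hypothetical choice, not part of the class):

import numpy as np
from scipy.stats import chi2

rng = np.random.RandomState(0)
series = rng.randn(300, 10)     # 300 series of length 10
series[-5:] += 3.0              # shift the last 5 series

detector = ChangeDetector(method='empirical').fit(series)
threshold = chi2.ppf(0.99, series.shape[1])
y_pred, distances = detector.predict(series, threshold)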
Example #19
def test_covariance():
    """Tests Covariance module on a simple dataset.

    """
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm="spectral"), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm="frobenius"), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError, cov.error_norm, emp_cov, norm="foo")
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d), norm="spectral"), 0)

    # test with one sample
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    with warnings.catch_warnings(record=True):
        cov.fit(X_1sample)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
Example #20
def deneAna(dene,plot=False,output="dene.png"):
    X = np.array([sim[2:] for sim in dene])

    Xmean = np.mean(X, axis=0)
    Xtocov = scale(X, axis=0, with_mean=True, with_std=False, copy=True)

    emp_cov = EmpiricalCovariance(assume_centered=True).fit(Xtocov)
    mahal_dist = emp_cov.mahalanobis(Xtocov)
    max_dist = max(mahal_dist)

    if plot:
        chi2range = [5.991465, max_dist]
        outliers = []
        #print "outliers in dene: ",
        for i in range(0, len(mahal_dist)):
            if mahal_dist[i] > chi2range[0]:
                outliers.append(i)
            plotdeneana(Xscaled,Xmean,emp_cov,chi2range,intnames,outliers,output,a,b,lc="blue")

    return emp_cov,Xmean, max_dist
Example #21
class OneClassMahalanobis(BaseClassifier):
    _fit_params = []
    def __init__(self, *args, **kwargs):
        pass
    
    def fit(self, data):
        #self.cov = MinCovDet().fit(data)
        self.cov = EmpiricalCovariance().fit(data)
    
    def predict(self, data):
        mahal_emp_cov = self.cov.mahalanobis(data)
        d = data.shape[1]
        thres = scipy.stats.chi2.ppf(0.95, d)
        
        self.mahal_emp_cov = mahal_emp_cov
        
        return (mahal_emp_cov > thres).astype(numpy.int32)*-2+1
    
    def decision_function(self, data):
        return self.mahal_emp_cov
class MahalanobisEstimator:
    """
    Store location and dispersion estimators of the empirical distribution of data
    provided in an array and allow computation of statistical distances.

    Parameters
    ----------
    arr : {pandas.DataFrame, np.ndarray}
        the matrix used to calculate covariance

    Attributes
    ----------
    sigma : sklearn.covariance.EmpiricalCovariance
        Fitted covariance estimator used to compute distances

    Methods
    -------
    mahalanobis(X)
        Computes mahalanobis distance between the input array (self.arr) and the X
        array as provided
    """

    def __init__(self, arr: Union[pd.DataFrame, np.ndarray]):
        self.sigma = EmpiricalCovariance().fit(arr)

    def mahalanobis(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """Compute the mahalanobis distance between the empirical distribution described
        by this object and points in an array `X`.

        Parameters
        ----------
        X : {pandas.DataFrame, np.ndarray}
            A samples-by-features array-like matrix whose Mahalanobis
            distances from self.arr are computed

        Returns
        -------
        numpy.array
            Mahalanobis distance between the input array and the original sigma
        """
        return self.sigma.mahalanobis(X)
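
A usage sketch (synthetic reference data; note that mahalanobis() returns squared distances):

import numpy as np

reference = np.random.RandomState(0).randn(1000, 4)
est = MahalanobisEstimator(reference)
d2 = est.mahalanobis(np.zeros((1, 4)))  # squared distance of the origin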
Example #23
    def fit(self, X, y=None):

        scaler_norm = Normalizer(norm=self.norm).fit(X)
        df_all_norm = scaler_norm.transform(X).astype(float)

        df_all_norm = pd.DataFrame(data=np.hstack(
            (df_all_norm, y.reshape((-1, 1)))),
                                   columns=self.colNames)

        X_SCALED = df_all_norm.iloc[:, :-1].values

        robust_cov_all = EmpiricalCovariance().fit(X_SCALED[:, :])
        robust_mahal_all = robust_cov_all.mahalanobis(
            X_SCALED[:, :] - robust_cov_all.location_)**(0.33)

        #ALT:2
        rm = pd.DataFrame(robust_mahal_all, columns=["value"])

        iqr = float(rm["value"].quantile(0.75)) - float(
            rm["value"].quantile(0.25))
        outlierRatioRob_all_1 = rm["value"].quantile(0.75) + (1.5 * iqr)
        outlierRatioRob_all_2 = rm["value"].quantile(0.25) - (1.5 * iqr)

        print(iqr, rm["value"].quantile(0.25), rm["value"].quantile(0.75))
        print("Outliers min ratio:", outlierRatioRob_all_1, outlierRatioRob_all_2)
        print("Num outliers detected:", len(
            X_SCALED[robust_mahal_all > outlierRatioRob_all_1, 0]))
        print("Num outliers detected:", len(
            X_SCALED[robust_mahal_all < outlierRatioRob_all_2, 0]))
        print([(self.codes[r], robust_mahal_all[r])
               for r in range(len(robust_mahal_all))])

        patients_out = self.codes[robust_mahal_all > outlierRatioRob_all_1]
        self.codesToDel.append(patients_out)
        print("Patients outliers above: {}".format(patients_out))

        patients_out = self.codes[robust_mahal_all < outlierRatioRob_all_2]
        self.codesToDel.append(patients_out)
        print("Patients outliers below: {}".format(patients_out))

        return self
Example #24
class MahalanobisDistance(DimReducer):
    """
    Computes a person's Mahalanobis distance 
    using the mean and covariance estimated from a set of young people.
    Uses sklearn; verified this matches up with the normal matrix computation.
    """
    def __init__(self, age_lower, age_upper):
        self.age_lower = age_lower
        self.age_upper = age_upper
        self.need_ages = True
        self.k = 1

    def _fit_from_processed_data(self, X, ages):
        young_people = (ages >= self.age_lower) & (ages <= self.age_upper)
        print("%i people between %s and %s used for mean/cov calculation" %
              (young_people.sum(), self.age_lower, self.age_upper))
        assert young_people.sum() > 1000
        self.model = EmpiricalCovariance(assume_centered=False)
        self.model.fit(X[young_people, :])

    def _get_projections_from_processed_data(self, X):
        md = np.sqrt(self.model.mahalanobis(X)).reshape([-1, 1])
        return md
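
A usage sketch (synthetic data; it calls the two private methods directly, on the assumption that DimReducer's public API simply wraps them):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(5000, 8)
ages = rng.uniform(20, 80, size=5000)   # ~1/3 fall in the reference band

md = MahalanobisDistance(age_lower=20, age_upper=40)
md._fit_from_processed_data(X, ages)
distances = md._get_projections_from_processed_data(X)  # sqrt of squared MD, shape (n, 1)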
def mahanalobis():
    totalX = []
    totalY = []
    flag = True
    countTrain = 0
    with open("creditcard.csv", "rb") as f:
        data = csv.reader(f)
        for row in data:
            if flag:
                flag = False
                continue
            if countTrain >= 228000:                            # CV on the first 80% of data
                break
            countTrain += 1
            totalX.append([float(i) for i in row[:-1]])
            totalY.append(int(row[-1]))
    totalX = scalar.fit_transform(totalX)
    print ("Data Loaded")
    clf = EmpiricalCovariance()
    clf.fit(totalX)
    distances = clf.mahalanobis(totalX)

    Y = []
    for i in range(len(totalY)):

        if np.log10(distances[i]) > 1.838:
            Y.append(1)
        else:
            Y.append(0)
    print ("Results")
    auc = roc_auc_score(totalY, Y)
    print(auc)
    fpr, _, _ = roc_curve(totalY, Y)
    print (fpr[1])
    _, recall, _ = precision_recall_curve(totalY, Y)
    print (recall[1])
    return auc, fpr[1], recall[1]
Example #26
def main():
    parser = argparse.ArgumentParser(
        description='Plot outlier-like distances for a 2-dimensional dataset')
    parser.add_argument('dataset',
                        type=argparse.FileType('r'),
                        help='a CSV file containing the dataset')
    parser.add_argument(
        '--plot',
        type=str,
        choices=['train', 'grid'],
        default='grid',
        help='plot the dataset or a grid evenly distributed over its span')
    parser.add_argument('--plotdims',
                        type=int,
                        choices=[2, 3],
                        default=2,
                        help='the number of dimensions to plot')

    args = parser.parse_args()

    X = np.loadtxt(args.dataset, delimiter=',')
    fig = plt.figure()

    xformer = NullTransformer()

    if X.shape[1] > 2:
        xformer = PCA(n_components=2)
        X = xformer.fit_transform(X)

    if args.plotdims == 2:
        plt.scatter(X[:, 0], X[:, 1], s=60, linewidth='0')
    else:
        plt.scatter(X[:, 0], X[:, 1])
    plt.show(block=False)

    path_to_script = os.path.realpath(__file__)
    dir_of_script = os.path.dirname(path_to_script)
    dataset_path = dir_of_script + '/outliers.npy'
    np.save(dataset_path, X)

    ###########################################################################
    # Train autoencoder with the n samples until convergence.  Run
    # evenly distributed samples through the autoencoder and compute
    # their reconstruction error.
    ###########################################################################

    maxseq_orig = np.max(X)
    minseq_orig = np.min(X)
    seqrange = np.abs(maxseq_orig - minseq_orig)
    maxseq = maxseq_orig + 0.5 * seqrange
    minseq = minseq_orig - 0.5 * seqrange
    print("minseq", minseq, "maxseq", maxseq)
    if args.plot == 'grid':
        seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
        Xplot = np.array([_ for _ in product(seq, seq)])
    else:
        Xplot = X

    robust_cov = MinCovDet().fit(X)
    robust_md = robust_cov.mahalanobis(Xplot)

    empirical_cov = EmpiricalCovariance().fit(X)
    empirical_md = empirical_cov.mahalanobis(Xplot)

    # Assume Xplot is at least 2-dimensional.
    if Xplot.shape[1] > 2:
        Xplot2d = bh_sne(Xplot)
    else:
        Xplot2d = Xplot

    robust_md01 = robust_md - np.nanmin(robust_md)
    robust_md01 = robust_md01 / np.nanmax(robust_md01)

    empirical_md01 = empirical_md - np.nanmin(empirical_md)
    empirical_md01 = empirical_md01 / np.nanmax(empirical_md01)

    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0],
                   Xplot2d[:, 1],
                   cmap=plt.cm.jet,
                   c=robust_md01,
                   s=60,
                   linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0],
                        Xplot2d[:, 1],
                        robust_md01,
                        cmap=plt.cm.jet,
                        color=robust_md01)
        ax.set_zlabel('Mahalanobis distance')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (robust covariance)')

    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0],
                   Xplot2d[:, 1],
                   cmap=plt.cm.jet,
                   c=empirical_md01,
                   s=60,
                   linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0],
                        Xplot2d[:, 1],
                        empirical_md01,
                        cmap=plt.cm.jet,
                        color=empirical_md01)
        ax.set_zlabel('Mahalanobis distance')

    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (empirical covariance)')

    enc_dec = [
        # tanh encoder, linear decoder
        ['tanh', 'linear'],
        # sigmoid encoder, linear decoder
        ['sigmoid', 'linear'],
        #######################################################################
        # The reconstruction error of the autoencoders trained with the
        # remaining commented-out pairs don't seem to match Mahalanobis
        # distance very well.  Feel free to uncomment them to see for
        # yourself.
        # linear encoder, linear decoder
        # ['linear', 'linear'],
        # tanh encoder, tanh decoder
        # ['tanh', 'tanh'],
        # tanh encoder, sigmoid decoder
        # ['tanh', 'sigmoid'],
        # sigmoid encoder, tanh decoder
        # ['sigmoid', 'tanh'],
        # sigmoid encoder, sigmoid decoder
        # ['sigmoid', 'sigmoid']
        #######################################################################
    ]

    for i, act in enumerate(enc_dec):
        enc, dec = act
        if dec == 'linear':
            dec = None
        model = train_autoencoder(dataset_path,
                                  act_enc=enc,
                                  act_dec=dec,
                                  nvis=X.shape[1],
                                  nhid=16)

        Xshared = theano.shared(np.asarray(Xplot, dtype=theano.config.floatX),
                                borrow=True)
        f = theano.function([], outputs=model.reconstruct(Xshared))
        fit = f()
        error = reconstruction_error(Xplot, fit)

        error01 = error - np.nanmin(error)
        error01 = error01 / np.nanmax(error01)

        fig = plt.figure()
        if args.plotdims == 2:
            ax = fig.add_subplot(1, 1, 1)
            ax.scatter(Xplot2d[:, 0],
                       Xplot2d[:, 1],
                       cmap=plt.cm.jet,
                       c=error,
                       s=60,
                       linewidth='0')
        else:
            ax = fig.add_subplot(1, 1, 1, projection='3d')
            ax.plot_trisurf(Xplot2d[:, 0],
                            Xplot2d[:, 1],
                            error,
                            cmap=plt.cm.jet,
                            color=error01)
            ax.set_zlabel('Reconstruction error')

        ax.set_xlabel('x')
        ax.set_ylabel('y')
        encdec_type = ', '.join(act)
        ax.set_title('Reconstruction error (' + encdec_type + ')')

        print("Correlation of robust MD and reconstruction error (" +
              str(encdec_type) + ") " + str(pearsonr(robust_md, error)))
        print("Correlation of empirical MD and reconstruction error (" +
              str(encdec_type) + ") " + str(pearsonr(empirical_md, error)))

    print("Correlation of robust MD and empirical MD " +
          str(pearsonr(robust_md, empirical_md)))

    os.remove(dataset_path)
    os.remove('outliers.pkl')

    plt.show(block=True)
Example #27
fig = plt.figure()

# Show data set
subfig1 = plt.subplot(1, 1, 1)
inlier_plot = subfig1.scatter(first_half, second_half,
                              color='black', label='daily diff in homes passed')

subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 800),
                     np.linspace(plt.ylim()[0], plt.ylim()[1], 100))

zz = np.c_[xx.ravel(), yy.ravel()]

mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')

mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')

subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1],
                inlier_plot],
               ['MLE dist', 'robust dist', 'kpis'],
               loc="upper right", borderaxespad=0)
print(np.corrcoef(first_half,second_half))
Example #29
import matplotlib.cm as cm


# Import data
data = pd.read_excel('C:/Users/dorta/Dropbox/Stanford/GS 240/Homeworks/Hmk4//ilr_data.xls')
ilr_cols = ['ilr'+str(x) for x in range(1,30)]
data_ilr = data.loc[:,ilr_cols]

# -------------------------- Outlier Detection --------------------------
# Fit the covariances
robust_cov = MinCovDet().fit(data_ilr)
emp_cov = EmpiricalCovariance().fit(data_ilr)

# Get the Mahalanobis distances
robust_dist = np.sqrt(robust_cov.mahalanobis(data_ilr))
classic_dist = np.sqrt(emp_cov.mahalanobis(data_ilr))

# Chi squared test at p=0.025
thresh = np.sqrt(chi2.isf(0.025, len(ilr_cols)))


# Plot of the outliers
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.scatter(classic_dist[robust_dist<thresh], robust_dist[robust_dist<thresh], s=7, c='c', marker="+", label='inliers')
ax1.scatter(classic_dist[robust_dist>thresh], robust_dist[robust_dist>thresh], s=7, c='r', marker="+", label='outliers')
x = np.linspace(*ax1.get_xlim())
ax1.plot(x, x, linewidth=1, linestyle='--', color='b')
ax1.plot([0, 20], [thresh, thresh], linewidth=0.5, linestyle='--', color='r')
ax1.plot([thresh, thresh], [0, 40], linewidth=0.5, linestyle='--', color='r')
plt.legend(loc='upper left')
# save for heuristic correction
age = df_test['var15']
age_ecdf = ECDF(df_train['var15'])
df_train['var15'] = age_ecdf(df_train['var15'])
df_test['var15'] = age_ecdf(df_test['var15'])

# feature engineering
df_train.loc[df_train['var3'] == -999999.000000, 'var3'] = 2.0
df_train['num_zeros'] = (df_train == 0).sum(axis=1)
df_test.loc[df_test['var3'] == -999999.000000, 'var3'] = 2.0
df_test['num_zeros'] = (df_test == 0).sum(axis=1)

# outliers
ec = EmpiricalCovariance()
ec = ec.fit(df_train)
m2 = ec.mahalanobis(df_train)
df_train = df_train[m2 < 40000]
df_target = df_target[m2 < 40000]

# clip
# df_test = df_test.clip(df_train.min(), df_train.max(), axis=1)

# standard preprocessing
prep = Pipeline([
    ('cd', ColumnDropper(drop=ZERO_VARIANCE_COLUMNS + CORRELATED_COLUMNS)),
    ('std', StandardScaler())
])

X_train = prep.fit_transform(df_train)
X_test = prep.transform(df_test)
y_train = df_target.values
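
The m2 < 40000 cutoff above is ad hoc. A hedged alternative, valid only under a rough Gaussian assumption (which these raw features likely violate), is a chi-squared quantile matched to the feature count:

from scipy.stats import chi2

# Hypothetical replacement for the hard-coded 40000 in the filter above;
# evaluate before df_train is filtered, then apply the same way:
# cutoff = chi2.ppf(0.999, df_train.shape[1])
# df_train = df_train[m2 < cutoff]; df_target = df_target[m2 < cutoff]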
Example #31
    y[n_outliers:] = -1

    # 2D plot
    plt.scatter(X_01[:-n_outliers, 0], X_01[:-n_outliers, 1], c=colors[0])
    plt.scatter(X_01[-n_outliers:, 0],
                X_01[-n_outliers:, 1],
                c=colors[0],
                marker='x')

    cov_emp = EmpiricalCovariance().fit(X_01_in)
    print('Covariance:', cov_emp.covariance_)

    xx, yy = np.meshgrid(np.linspace(-7, 7, 150), np.linspace(-7, 7, 150))

    Z = cov_emp.mahalanobis(
        np.c_[xx.ravel(), yy.ravel()]
    ) > chi2_interval_max  # make sure the degrees of freedom for chi2 are correct
    Z = Z.reshape(xx.shape)
    plt.contour(xx, yy, Z, levels=[0], linewidths=200, colors='black')

    outlier_pred = cov_emp.mahalanobis(X_01) > chi2_interval_max
    outlier_true = y == -1

    plt.scatter(X_01[outlier_pred & outlier_true, 0],
                X_01[outlier_pred & outlier_true, 1],
                c=colors[1],
                marker='x',
                label='TP')
    plt.scatter(X_01[~outlier_pred & outlier_true, 0],
                X_01[~outlier_pred & outlier_true, 1],
                c=colors[0],
# Show data set
subfig1 = pl.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions
xx, yy = np.meshgrid(np.linspace(pl.xlim()[0], pl.xlim()[1], 100),
                     np.linspace(pl.ylim()[0], pl.ylim()[1], 100))
zz = np.c_[xx.ravel(), yy.ravel()]

mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=pl.cm.PuBu_r,
                                  linestyles='dashed')

mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=pl.cm.YlOrBr_r, linestyles='dotted')

subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1],
                inlier_plot, outlier_plot],
               ['MLE dist', 'robust dist', 'inliers', 'outliers'],
               loc="upper right", borderaxespad=0)
pl.xticks(())
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1], color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:],
                               X[:, 1][-n_outliers:],
                               color='red',
                               label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0],
                                 plt.xlim()[1], 100),
                     np.linspace(plt.ylim()[0],
                                 plt.ylim()[1], 100))
zz = np.c_[xx.ravel(), yy.ravel()]

mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx,
                                  yy,
                                  np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')

mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx,
                                 yy,
                                 np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r,
                                 linestyles='dotted')
Example #34
    discriminator.save_weights('models/dis_weights_%d.h5' % id_remove)
    with open('models/dis_architecture_%d.json' % id_remove, 'w') as f:
        f.write(discriminator.to_json())

    X_test_remaining = X_test.copy()
    X_test = np.vstack([removed_routes, X_test_remaining])
    X_test = np.reshape(X_test, (len(X_test), TRIP_SIZE, NUM_VALS, 1))

    Y_test = np.hstack([np.ones(len(removed_routes)), np.zeros(len(X_test_remaining))])

    # Mahalanobis
    encodings_train = encoder.predict(X_train)
    emp_cov = EmpiricalCovariance().fit(encodings_train)

    encodings_test = encoder.predict(X_test)
    emp_mahal = emp_cov.mahalanobis(encodings_test)

    val_arr = np.asarray(emp_mahal)
    val_probs = val_arr / max(val_arr)

    roc_auc = roc_auc_score(Y_test, val_probs)
    prauc = average_precision_score(Y_test, val_probs)
    roc_auc_scores.append(roc_auc)
    prauc_scores.append(prauc)

    print("ROC AUC SCORE FOR %d: %f" % (id_remove, roc_auc))
    print("PRAUC SCORE FOR %d: %f" % (id_remove, prauc))

    np.savetxt('auc/roc_auc_scores.txt', roc_auc_scores, fmt='%5s', delimiter=",")
    plt.scatter(np.arange(len(roc_auc_scores)), roc_auc_scores)
    plt.savefig("auc/roc_auc_scores.png")
Example #35
emp_cov = EmpiricalCovariance().fit(X)


# Display results
fig = pl.figure()

# Show data set
subfig1 = pl.subplot(3, 1, 1)
subfig1.scatter(X[:, 0], X[:, 1], color='black', label='inliers')
subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
subfig1.legend(loc="upper right")

emp_mahal = emp_cov.mahalanobis(X) ** (0.33)
subfig2 = pl.subplot(2, 2, 3)
subfig2.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=.25)
subfig2.plot(1.26 * np.ones(n_samples - n_outliers),
             emp_mahal[:-n_outliers], '+k', markeredgewidth=1)
subfig2.plot(2.26 * np.ones(n_outliers),
             emp_mahal[-n_outliers:], '+k', markeredgewidth=1)
subfig2.axes.set_xticklabels(('inliers', 'outliers'), size=11)
subfig2.set_ylabel(r"$\sqrt[3]{\rm{(Mahal. dist.)}}$")
subfig2.set_title("1. from non-robust estimates\n(Maximum Likelihood)")

robust_mahal = robust_cov.mahalanobis(X) ** (0.33)
subfig3 = pl.subplot(2, 2, 4)
subfig3.boxplot([robust_mahal[:-n_outliers], robust_mahal[-n_outliers:]],
                widths=.25)
subfig3.plot(1.26 * np.ones(n_samples - n_outliers),
Example #36
# Show data set
subfig1 = plt.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
                     np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
zz = np.c_[xx.ravel(), yy.ravel()]

mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')

mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')

subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1],
                inlier_plot, outlier_plot],
               ['MLE dist', 'robust dist', 'inliers', 'outliers'],
               loc="upper right", borderaxespad=0)
plt.xticks(())
offset_bottom = fig.subplotpars.bottom
width = fig.subplotpars.right - offset_left
subfig1 = pl.subplot(3, 1, 1)
subfig2 = pl.subplot(3, 1, 2)
subfig3 = pl.subplot(3, 1, 3)

# Show data set
subfig1.scatter(X[:, 0], X[:, 1], color='black', label='inliers')
subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
subfig1.legend(loc="upper right")

# Empirical covariance -based Mahalanobis distances
subfig2.scatter(np.arange(n_samples), emp_cov.mahalanobis(X),
                color='black', label='inliers')
subfig2.scatter(np.arange(n_samples)[-n_outliers:],
                emp_cov.mahalanobis(X)[-n_outliers:],
                color='red', label='outliers')
subfig2.set_ylabel("Mahal. dist.")
subfig2.set_title("1. from empirical estimates")
subfig2.axes.set_position(pos=[offset_left, 0.39, width, .2])

# MCD-based Mahalanobis distances
subfig3.scatter(np.arange(n_samples), robust_cov.mahalanobis(X),
                color='black', label='inliers')
subfig3.scatter(np.arange(n_samples)[-n_outliers:],
                robust_cov.mahalanobis(X)[-n_outliers:],
                color='red', label='outliers')
subfig3.set_ylabel("Mahal. dist.")