def _h_getMahalanobisRobust(dat, critical_alpha=0.01, good_rows=np.zeros(0)):
    '''Calculate the Mahalanobis distance from the sample vector.'''
    if good_rows.size == 0:
        good_rows = np.any(~np.isnan(dat), axis=1)

    try:
        dat2fit = dat[good_rows]
        assert not np.any(np.isnan(dat2fit))

        robust_cov = MinCovDet().fit(dat2fit)
        mahalanobis_dist = np.sqrt(robust_cov.mahalanobis(dat))
    except ValueError:
        # this step will fail if the covariance matrix is singular. This happens if the data is not
        # a unimodal symmetric distribution, e.g. when there are too many small noisy particles.
        # Therefore take the safe option and return zeros in the Mahalanobis
        # distance if this is the case.
        mahalanobis_dist = np.zeros(dat.shape[0])

    # critical value of the Mahalanobis distance using the chi-square distribution
    # (strictly, the chi-square quantile applies to the squared distance)
    # https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    maha_lim = chi2.ppf(1 - critical_alpha, dat.shape[1])
    outliers = mahalanobis_dist > maha_lim

    return mahalanobis_dist, outliers, maha_lim
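A quick usage sketch for the helper above, with made-up data (it assumes the same numpy / scipy.stats / sklearn.covariance imports the snippet itself relies on):

import numpy as np
from scipy.stats import chi2
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(0)
dat = rng.randn(200, 3)      # hypothetical feature matrix
dat[:5] += 8.0               # plant five gross outliers

dist, outliers, maha_lim = _h_getMahalanobisRobust(dat, critical_alpha=0.01)
print(outliers[:5])          # the shifted rows stand out by robust distance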
Example #2
    def _naiveMCD(self, dataset, thresh=3):

        types = LoLTypeInference().getDataTypes(dataset)
        qdataset = [[d[i] for i, t in enumerate(types) if t == 'numerical']
                    for d in dataset]

        X = featurize(qdataset, [t for t in types if t == 'numerical'])
        xshape = np.shape(X)

        # add a little noise to avoid conditioning problems with the estimate
        Xsamp = X + 0.01 * np.random.randn(xshape[0], xshape[1])

        m = MinCovDet()
        m.fit(Xsamp)
        sigma = np.linalg.inv(m.covariance_)
        mu = np.mean(X, axis=0)

        results = []
        for i in range(0, xshape[0]):
            # assumes X behaves like np.matrix, so ``*`` is a matrix product
            val = np.squeeze((X[i, :] - mu) * sigma * (X[i, :] - mu).T)[0, 0]
            results.append([str(val)])

        e = ErrorDetector(results,
                          modules=[QuantitativeErrorModule],
                          config=[{
                              'thresh': thresh
                          }])
        e.fit()

        return set([error['cell'][0] for error in e])
Example #3
def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov,
                          tol_support):

    rand_gen = np.random.RandomState(0)
    data = rand_gen.randn(n_samples, n_features)
    # add some outliers
    outliers_index = rand_gen.permutation(n_samples)[:n_outliers]
    outliers_offset = 10. * \
        (rand_gen.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[outliers_index] += outliers_offset
    inliers_mask = np.ones(n_samples).astype(bool)
    inliers_mask[outliers_index] = False

    pure_data = data[inliers_mask]
    # compute MCD by fitting an object
    mcd_fit = MinCovDet(random_state=rand_gen).fit(data)
    T = mcd_fit.location_
    S = mcd_fit.covariance_
    H = mcd_fit.support_
    # compare with the estimates learnt from the inliers
    error_location = np.mean((pure_data.mean(0) - T) ** 2)
    assert(error_location < tol_loc)
    error_cov = np.mean((empirical_covariance(pure_data) - S) ** 2)
    assert(error_cov < tol_cov)
    assert(np.sum(H) >= tol_support)
    assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_)
Example #4
def outliers_finder(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Finding and removing outliers
    :param data_frame:
    :return:
    """
    (df_X, df_y) = splitting_dataset(data_frame)
    # Define the PCA object
    pca = PCA()

    # Run PCA on scaled data and obtain the scores array
    T = pca.fit_transform(StandardScaler().fit_transform(df_X.values))

    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(T[:, :5])

    # Get the Mahalanobis distance
    m = robust_cov.mahalanobis(T[:, :5])

    data_frame['mahalanobis'] = m

    # calculate a p-value for each Mahalanobis distance
    # (note: the chi-square degrees of freedom should match the 5 PCA components used above)
    data_frame['p'] = 1 - chi2.cdf(data_frame['mahalanobis'], 3)
    data_frame.sort_values('p', ascending=False, inplace=True)
    data_frame['Drops'] = (data_frame['p'] <= 0.001)

    indexNames = data_frame[data_frame['Drops'] == True].index
    print(indexNames.size)
    data_frame.drop(indexNames, inplace=True)

    return data_frame
Example #5
    def obtenerOutliersMinCovarianza(self, datosOriginales, datosATestear):
        # NOTE: MinCovDet itself has no predict(); EllipticEnvelope (its
        # subclass in sklearn.covariance) provides the same robust fit
        # plus inlier/outlier prediction.
        clf = EllipticEnvelope().fit(datosOriginales)
        resultadoValoresATestear = clf.predict(datosATestear)

        listaOutliers, listaInliers = self.getListasOutliersInliers(
            resultadoValoresATestear, datosATestear)
        return listaOutliers, listaInliers
Example #6
def test_mcd_issue1127():
    # Check that the code does not break with X.shape = (3, 1)
    # (i.e. n_support = n_samples)
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(3, 1))
    mcd = MinCovDet()
    mcd.fit(X)
Example #7
def getMahalanobisRobust(dat, critical_alpha=0.01, good_rows=np.zeros(0)):

    '''Calculate the Mahalanobis distance from the sample vector.'''

    if good_rows.size == 0:
        good_rows = np.any(~np.isnan(dat), axis=1)

    try:
        robust_cov = MinCovDet().fit(dat[good_rows])
        mahalanobis_dist = np.sqrt(robust_cov.mahalanobis(dat))
    except ValueError:
        # this step will fail if the covariance matrix is singular. This happens if the data is not
        # a unimodal symmetric distribution, e.g. when there are too many small noisy particles.
        # Therefore take the safe option and return zeros in the Mahalanobis distance.
        mahalanobis_dist = np.zeros(dat.shape[0])

    # critical value of the Mahalanobis distance using the chi-square distribution
    # (strictly, the chi-square quantile applies to the squared distance)
    # https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    maha_lim = chi2.ppf(1 - critical_alpha, dat.shape[1])
    outliers = mahalanobis_dist > maha_lim

    return mahalanobis_dist, outliers, maha_lim
Example #8
    def __init__(self):
        # Config for loading no action spectrum (noise data)
        rospack = rospkg.RosPack()
        self.train_dir = osp.join(rospack.get_path(
            'decopin_hand'), 'train_data')
        if not osp.exists(self.train_dir):
            makedirs(self.train_dir)
        self.noise_data_path = osp.join(self.train_dir, 'noise.npy')
        if not osp.exists(self.noise_data_path):
            rospy.logerr('{} is not found. Exit.'.format(self.noise_data_path))
            exit()
        no_action_data = np.load(self.noise_data_path)
        # extract about 100 samples from no_action_data
        divide = max(1, len(no_action_data) // 100)  # integer step for slicing
        no_action_data = no_action_data[::divide]
        # Detect in action or not by mahalanobis distance
        self.anormal_threshold = rospy.get_param('~anormal_threshold')
        self.mcd = MinCovDet()
        self.mcd.fit(no_action_data)
        rospy.loginfo('Calc covariance matrix for Mahalanobis distance')

        # ROS
        self.bridge = CvBridge()
        self.pub = rospy.Publisher('~in_action', Bool, queue_size=1)
        self.sub = rospy.Subscriber('~raw_spectrogram', Image, self.cb)
Example #9
def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov,
                          tol_support):
    rand_gen = np.random.RandomState(0)
    data = rand_gen.randn(n_samples, n_features)
    # add some outliers
    outliers_index = rand_gen.permutation(n_samples)[:n_outliers]
    outliers_offset = 10. * \
                      (rand_gen.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[outliers_index] += outliers_offset
    inliers_mask = np.ones(n_samples).astype(bool)
    inliers_mask[outliers_index] = False

    pure_data = data[inliers_mask]
    # compute MCD by fitting an object
    mcd_fit = MinCovDet(random_state=rand_gen).fit(data)
    T = mcd_fit.location_
    S = mcd_fit.covariance_
    H = mcd_fit.support_
    # compare with the estimates learnt from the inliers
    error_location = np.mean((pure_data.mean(0) - T) ** 2)
    assert (error_location < tol_loc)
    error_cov = np.mean((empirical_covariance(pure_data) - S) ** 2)
    assert (error_cov < tol_cov)
    assert (np.sum(H) >= tol_support)
    assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_)
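An illustrative invocation of the test helper above; the tolerances are loose, made-up values, not the ones from scikit-learn's own test suite:

# 500 samples in 5 features, 50 of them pushed away by +/-5 per coordinate;
# the robust estimates should land close to the inlier truth
launch_mcd_on_dataset(n_samples=500, n_features=5, n_outliers=50,
                      tol_loc=0.05, tol_cov=0.05, tol_support=350)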
Example #10
    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """
        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = MinCovDet(store_precision=self.store_precision,
                                   assume_centered=self.assume_centered,
                                   support_fraction=self.support_fraction,
                                   random_state=self.random_state)
        self.detector_.fit(X=X, y=y)

        # Use the Mahalanobis distance as the outlier score
        self.decision_scores_ = self.detector_.dist_
        self._process_decision_scores()
        return self
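For reference, dist_ on a fitted MinCovDet holds the squared robust Mahalanobis distances of the training points, which is why it can serve directly as an outlier score; a standalone check with throwaway data:

import numpy as np
from sklearn.covariance import MinCovDet

X = np.random.RandomState(42).randn(100, 4)
mcd = MinCovDet(random_state=42).fit(X)
# dist_ matches mahalanobis() evaluated on the training data
assert np.allclose(mcd.dist_, mcd.mahalanobis(X))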
Example #11
def test_mcd_increasing_det_warning():
    # Check that a warning is raised if we observe increasing determinants
    # during the c_step. In theory the sequence of determinants should be
    # decreasing. Increasing determinants are likely due to ill-conditioned
    # covariance matrices that result in poor precision matrices.

    X = [[5.1, 3.5, 1.4, 0.2],
         [4.9, 3.0, 1.4, 0.2],
         [4.7, 3.2, 1.3, 0.2],
         [4.6, 3.1, 1.5, 0.2],
         [5.0, 3.6, 1.4, 0.2],
         [4.6, 3.4, 1.4, 0.3],
         [5.0, 3.4, 1.5, 0.2],
         [4.4, 2.9, 1.4, 0.2],
         [4.9, 3.1, 1.5, 0.1],
         [5.4, 3.7, 1.5, 0.2],
         [4.8, 3.4, 1.6, 0.2],
         [4.8, 3.0, 1.4, 0.1],
         [4.3, 3.0, 1.1, 0.1],
         [5.1, 3.5, 1.4, 0.3],
         [5.7, 3.8, 1.7, 0.3],
         [5.4, 3.4, 1.7, 0.2],
         [4.6, 3.6, 1.0, 0.2],
         [5.0, 3.0, 1.6, 0.2],
         [5.2, 3.5, 1.5, 0.2]]

    mcd = MinCovDet(random_state=1)
    warn_msg = "Determinant has increased"
    with pytest.warns(RuntimeWarning, match=warn_msg):
        mcd.fit(X)
Example #12
def find_outliers_mahalanobis(featMatProjected,
                              extremeness=2.,
                              figsize=[8, 8],
                              saveto=None):
    """ A function to determine to return a list of outlier indices using the
        Mahalanobis distance. 
        Outlier threshold = std(Mahalanobis distance) * extremeness degree 
        [extreme_values=2, very_extreme_values=3 --> according to 68-95-99.7 rule]
    """
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from pathlib import Path
    from sklearn.covariance import MinCovDet
    from matplotlib import pyplot as plt

    # NB: Euclidean distance puts more weight than it should on correlated variables
    # Chicken and egg situation, we can’t know they are outliers until we calculate
    # the stats of the distribution, but the stats of the distribution are skewed by outliers!
    # Mahalanobis gets around this by weighting by robust estimation of covariance matrix

    # Fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(
        featMatProjected[:, :10])  # Use the first 10 principal components

    # Get the Mahalanobis distance
    MahalanobisDist = robust_cov.mahalanobis(featMatProjected[:, :10])

    projectedTable = pd.DataFrame(featMatProjected[:,:10],\
                      columns=['PC' + str(n+1) for n in range(10)])

    plt.ioff() if saveto else plt.ion()
    plt.close('all')
    plt.style.use(CUSTOM_STYLE)
    sns.set_style('ticks')
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_facecolor('#F7FFFF')
    plt.scatter(np.array(projectedTable['PC1']),
                np.array(projectedTable['PC2']),
                c=MahalanobisDist)  # colour PCA by Mahalanobis distance
    plt.title('Mahalanobis Distance for Outlier Detection', fontsize=20)
    plt.colorbar()
    ax.grid(False)

    if saveto:
        saveto.parent.mkdir(exist_ok=True, parents=True)
        suffix = Path(saveto).suffix.strip('.')
        plt.savefig(saveto, format=suffix, dpi=300)
    else:
        plt.show()

    k = np.std(MahalanobisDist) * extremeness
    upper_t = np.mean(MahalanobisDist) + k
    outliers = []
    for i in range(len(MahalanobisDist)):
        if (MahalanobisDist[i] >= upper_t):
            outliers.append(i)
    print("Outliers found: %d" % len(outliers))

    return np.array(outliers)
Example #13
    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = MinCovDet(store_precision=self.store_precision,
                                   assume_centered=self.assume_centered,
                                   support_fraction=self.support_fraction,
                                   random_state=self.random_state)
        self.detector_.fit(X=X, y=y)

        # Use the Mahalanobis distance as the outlier score
        self.decision_scores_ = self.detector_.dist_
        self._process_decision_scores()
        return self
Example #14
def test_mcd_issue1127():
    # Check that the code does not break with X.shape = (3, 1)
    # (i.e. n_support = n_samples)
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(3, 1))
    mcd = MinCovDet()
    mcd.fit(X)
Example #15
class ActionDetector(object):
    """
    Publish whether the robot is in action or not to rostopic, by MT method.

    NOTE
    Before starting to detect action, some waiting time is required.
    This is preparation time to calculate mahalanobis distance.
    Reaction speed for action detection is a bit late
    because spectrum is mean of spectrogram, not right edge of spectrogram
    """

    def __init__(self):
        # Config for loading no action spectrum (noise data)
        rospack = rospkg.RosPack()
        self.train_dir = osp.join(rospack.get_path(
            'decopin_hand'), 'train_data')
        if not osp.exists(self.train_dir):
            makedirs(self.train_dir)
        self.noise_data_path = osp.join(self.train_dir, 'noise.npy')
        if not osp.exists(self.noise_data_path):
            rospy.logerr('{} is not found. Exit.'.format(self.noise_data_path))
            exit()
        no_action_data = np.load(self.noise_data_path)
        # extract about 100 samples from no_action_data
        divide = max(1, len(no_action_data) // 100)  # integer step for slicing
        no_action_data = no_action_data[::divide]
        # Detect in action or not by mahalanobis distance
        self.anormal_threshold = rospy.get_param('~anormal_threshold')
        self.mcd = MinCovDet()
        self.mcd.fit(no_action_data)
        rospy.loginfo('Calc covariance matrix for Mahalanobis distance')

        # ROS
        self.bridge = CvBridge()
        self.pub = rospy.Publisher('~in_action', Bool, queue_size=1)
        self.sub = rospy.Subscriber('~raw_spectrogram', Image, self.cb)

    def cb(self, msg):
        """
        Main process of NoiseSaver class
        Publish whether the robot is in action or not
        """

        # spectrogram.shape is (height, width) = (spectrum, time)
        spectrogram = self.bridge.imgmsg_to_cv2(msg)
        self.current_spectrum = np.average(spectrogram, axis=1)
        # Check whether current spectrogram is in action or not
        spectrum = self.current_spectrum[None]
        dist = self.mcd.mahalanobis(spectrum)[0]
        info_message = '(mahalanobis distance, threshold) = ({}, {})'.format(
            dist, self.anormal_threshold)
        if dist < self.anormal_threshold:
            self.in_action = False
            rospy.loginfo('No action\n' + info_message + '\n')
        else:
            self.in_action = True
            rospy.loginfo('### In action ###\n' + info_message + '\n')
        pub_msg = Bool(data=self.in_action)
        self.pub.publish(pub_msg)
Example #16
def mahalanobis_calculate(data, num_pcs):
    pca = PCA(num_pcs)
    T = pca.fit_transform(data)
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(T)
    # Get the Mahalanobis distance
    m = robust_cov.mahalanobis(T)
    return m
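A minimal call to the helper above with synthetic data (it assumes the snippet's own PCA and MinCovDet imports):

import numpy as np

rng = np.random.RandomState(0)
data = rng.randn(300, 20)                  # hypothetical feature matrix
scores = mahalanobis_calculate(data, num_pcs=5)
print(scores.shape)                        # (300,): squared distances in PC space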
Example #17
def as7262_outliers(data, scatter_correction=None):
    data_columns = data[as7262_wavelengths]
    print(data_columns)
    # data_columns.T.plot()
    # plt.plot(data_columns.T)
    plt.show()
    if scatter_correction == "SNV":
        data_columns = processing.snv(data_columns)
    elif scatter_correction == "MSC":
        data_columns, _ = processing.msc(data_columns)

    # svm = OneClassSVM().fit_predict(snv_data)
    # print(svm)
    robust_cov = MinCovDet().fit(data_columns)
    mahal_dist = robust_cov.mahalanobis(data_columns)
    # mahal_dist = MahalanobisDist(np.array(data_columns), verbose=True)
    print(mahal_dist)


    zscore(data_columns)
    print('+++++')
    mean = np.mean(mahal_dist)
    std = 3*np.std(mahal_dist)
    print(mean, std)
    print(mean - std, mean + std)
    zscore_mahal = (mahal_dist - mean) / np.std(mahal_dist)
    # print(zscore_mahal)
    # print(zscore_mahal.max(), zscore_mahal.argmax(), data_columns.loc[zscore_mahal.argmax()])
    print('pppp')
    print(data_columns)
    print(zscore_mahal.argmax())
    outliers = data_columns.loc[zscore_mahal > 3].index
    outliers = data_columns.iloc[zscore_mahal.argmax()].name
    # print(data_columns.loc[zscore_mahal > 3].index)
    rows = data_columns.loc[outliers]
    # print(data_columns.loc[zscore_mahal.argmax()].name)
    print(data_columns.shape)
    print(rows)

    # print((mahal_dist-mahal_dist.mean()).std())
    # print(mahal_dist.std())
    # print(mahal_dist.mean() + 3*mahal_dist.std())
    # mahal_dist2 = MahalanobisDist(np.array(data_columns), verbose=True)
    n, bins, _ = plt.hist(zscore_mahal, bins=40)
    plt.show()

    # x_hist = np.linspace(min(mahal_dist), max(mahal_dist), 100)
    #
    # popt, pcov = curve_fit(gauss_function, bins[:len(n)], n, maxfev=100000, p0=[300, 0, 20])
    # new_fit = gauss_function(x_hist, *popt)
    # plt.plot(x_hist, new_fit, 'r--')
    # color = data_columns.shape[0] * ["#000000"]
    # color[data_columns.loc[zscore_mahal.argmax()].name] = "#FF0000"
    plt.plot(data_columns.T, c="black")
    plt.plot(rows.T, c="red")
    plt.plot(data_columns.mean(), c="blue", lw=4)
    # snv_data.T.plot(color=color)
    plt.show()
Example #18
def detect(train_data: np.ndarray, test_data: np.ndarray) -> list:
    estimated_covariance = MinCovDet().fit(train_data)
    train_dist = estimated_covariance.mahalanobis(train_data)
    train_max = np.max(train_dist)

    # flag test points whose robust distance exceeds the largest training distance
    return [
        0 if data <= train_max else 1
        for data in estimated_covariance.mahalanobis(test_data)
    ]
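A usage sketch with synthetic train/test data; the threshold is simply the largest robust distance seen in training, so clean test rows map to 0 and far rows to 1:

import numpy as np
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(1)
train = rng.randn(200, 3)
test = np.vstack([rng.randn(5, 3),           # in-distribution rows
                  rng.randn(5, 3) + 10.0])   # strongly shifted rows
print(detect(train, test))                   # typically five 0s followed by five 1s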
Example #19
def mahal_plot(e):
    first_half = e[1:len(e) - 1]
    second_half = e[2:len(e)]
    X = np.array([first_half, second_half])
    X = np.transpose(X)
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(X)

    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)

    fig = plt.figure()

    # Show data set
    subfig1 = plt.subplot(1, 1, 1)
    inlier_plot = subfig1.scatter(first_half,
                                  second_half,
                                  color='black',
                                  label='daily diff in homes passed')

    subfig1.set_title("Mahalanobis distances of the iid invariants:")

    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(plt.xlim()[0],
                                     plt.xlim()[1], 800),
                         np.linspace(plt.ylim()[0],
                                     plt.ylim()[1], 100))

    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz)
    mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
    emp_cov_contour = subfig1.contour(xx,
                                      yy,
                                      np.sqrt(mahal_emp_cov),
                                      cmap=plt.cm.PuBu_r,
                                      linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz)
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
    # note: contour() expects the plural ``linewidths``/``colors``; the
    # singular kwargs used here originally are not valid contour arguments
    robust_contour = subfig1.contour(xx,
                                     yy,
                                     np.sqrt(mahal_robust_cov),
                                     cmap=plt.cm.YlOrBr_r,
                                     linewidths=3)

    subfig1.legend([
        emp_cov_contour.collections[1], robust_contour.collections[1],
        inlier_plot
    ], ['MLE dist', 'robust dist', 'kpis'],
                   loc="upper right",
                   borderaxespad=0)
    print(np.corrcoef(first_half, second_half))
    return (robust_cov, emp_cov)
Example #20
 def leverage(self, X):
     mcd = MinCovDet()
     mcd.fit(X)
     loc, cov = mcd.location_, mcd.covariance_
     inversed_cov = np.linalg.inv(cov)
     result = np.zeros(X.shape[0])
     for i, element in enumerate(X):
         h = np.sqrt(
             np.transpose(element - loc) @ inversed_cov @ (element - loc))
         result[i] = h
     return result
Example #21
File: scores.py  Project: kelicht/dace
 def __init__(self, estimator='ML', tol=1e-6):
     if (estimator == 'ML'):
         self.estimator_ = EmpiricalCovariance(store_precision=True,
                                               assume_centered=False)
     elif (estimator == 'MCD'):
         self.estimator_ = MinCovDet(store_precision=True,
                                     assume_centered=False,
                                     support_fraction=None,
                                     random_state=0)
     else:
         self.estimator_ = None
     self.tol_ = tol
Example #22
def l_ratio(X, labels):
    ''' This is a measure of how far a cluster is from neighbouring clusters,
        computed via the Mahalanobis distance to the points that do not
        belong to the cluster

        ATTENTION:  the covariance matrix is estimated with the robust
                    covariance (outliers not taken into account)

    Parameters
    ----------
        X : ndarray
            Data (assumed to be multivariate normal distributed)
        labels : ndarray
            Labels

    Returns
    -------
        lr : list, size(number of clusters)
            L-ratio for each cluster
    '''
    lr = list()

    # unique labels
    unique_l = set(labels).difference([-1])

    # if the set is empty, return -1
    if len(unique_l) == 0:
        return -1

    # degrees of freedom
    df = len(X[0])

    # for each cluster
    for label in unique_l:
        # compute points in cluster
        Xi = X[(labels==label)]

        # number of spikes in cluster
        n = len(Xi)

        # compute points out of the cluster
        outliers = X[(labels!=label)]

        # estimate robust covariance
        mcd = MinCovDet().fit(Xi)

        # compute mahalanobis distance for outliers
        Dmcd = mcd.mahalanobis(outliers)

        # compute L-ratio
        lr.append(np.sum(1-chi2.cdf(Dmcd,df))/n)

    return lr
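A quick sanity check of l_ratio on two well-separated synthetic clusters (assumes the snippet's numpy, scipy.stats.chi2, and MinCovDet imports):

import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(100, 2),            # cluster 0
               rng.randn(100, 2) + 6.0])     # cluster 1, far away
labels = np.array([0] * 100 + [1] * 100)
print(l_ratio(X, labels))                    # two values near 0: well separated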
Example #23
def get_outliers(X, chi2thr=0.975, plot=False, figurename=None):
    """ detect outliers by Mahalanobis distance
    """
    robust_cov = MinCovDet(random_state=100).fit(X)
    MD = robust_cov.mahalanobis(X)
    n_samples = len(MD)
    chi2 = stats.chi2
    degrees_of_freedom = X.shape[1]
    threshold = chi2.ppf(chi2thr, degrees_of_freedom)
    y_pred = MD > threshold
    outlierpercent = sum(y_pred) / float(n_samples)
    return outlierpercent, y_pred, MD
Example #24
def RejectOutliers(data, threshold=3):
    """
    Rejects nodal outliers whose squared Mahalanobis distance from the robust
    center is at least :threshold:
    """
    from sklearn.covariance import MinCovDet
    clf = MinCovDet()
    clf.fit(data)
    distances = clf.mahalanobis(data)

    outliers = np.where(distances >= threshold)[0]
    inliers = np.where(distances < threshold)[0]
    return inliers, outliers
Example #25
  def __init__(self, lab_coords_x, lab_coords_y, data, i_panel, delta_scalar, params, verbose=False):
    training_data = []

    mean_x = flex.mean(lab_coords_x)
    mean_y = flex.mean(lab_coords_y)
    limit=delta_scalar * 10

    for ix in range(len(data)):
      if abs(lab_coords_x[ix] - mean_x) > limit: continue
      if abs(lab_coords_y[ix] - mean_y) > limit: continue
      if abs(data[ix])>1: continue
      training_data.append((lab_coords_x[ix],lab_coords_y[ix],data[ix]))
    if verbose: print("Training data is less",len(lab_coords_x) - len(training_data),end=" ")
    colorcode_set = []
    for ix in range(len(data)):
      colorcode_set.append((lab_coords_x[ix],lab_coords_y[ix],data[ix]))

    from sklearn.covariance import EmpiricalCovariance, MinCovDet
    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance(assume_centered=False, store_precision=True).fit(X=training_data)
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet(assume_centered=False, store_precision=True).fit(X=training_data)

    features = ["Δx","Δy","ΔΨ(deg)"]
    if verbose:
      print("%3d"%i_panel,end=" ")
      print("%4d items "%(len(training_data),),end=" ")
    for idx_report in range(len(features)):
      feature = features[idx_report]
      diag_elem = math.sqrt(emp_cov.covariance_[idx_report,idx_report])
      if verbose: print( "%s=%7.2f±%6.2f"%(feature, emp_cov.location_[idx_report], diag_elem),end=" ")

    if verbose: print("%4d items:"%(flex.bool(robust_cov.support_).count(True)),end=" ")
    for idx_report in range(len(features)):
      feature = features[idx_report]
      diag_elem = math.sqrt(robust_cov.covariance_[idx_report,idx_report])
      if verbose: print( "%s=%7.2f±%6.2f"%(feature, robust_cov.location_[idx_report], diag_elem),end=" ")

    disc = flex.double(robust_cov.mahalanobis(X=colorcode_set)) # this metric represents mahalanobis ** 2
    disc_select = disc < (params.residuals.mcd_filter.mahalanobis_distance)**2
    if params.residuals.mcd_filter.keep == "outliers":
      disc_select = (disc_select==False)
    if verbose: print("OK %4.1f%%"%(100*(disc_select.count(True))/len(training_data)))
    self.lab_coords_x = lab_coords_x.select(disc_select)
    self.lab_coords_y = lab_coords_y.select(disc_select)
    self.data = data.select(disc_select)
    self.n_input = len(lab_coords_x)
    self.n_output = len(self.lab_coords_x)
    self.emp_cov = emp_cov
    self.rob_cov = robust_cov
Example #26
def mahalanobis_plot(ctry=None, df=None, weighted=True, inliers=False):
    """
    See http://scikit-learn.org/0.13/modules/outlier_detection.html#\
        fitting-an-elliptic-envelop

    for details.
    """
    if df is None and ctry is None:
        raise ValueError('Either the country or a dataframe must be supplied')
    elif df is None:
        df = load_res(ctry, weighted=weighted)
    if inliers:
        df = get_inliers(df=df)
    X = df.values
    robust_cov = MinCovDet().fit(X)
    #-----------------------------------------------------------------------------
    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)
    #-----------------------------------------------------------------------------
    # Display results
    fig = plt.figure()
    fig.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
    #-----------------------------------------------------------------------------
    # Show data set
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(X[:, 0], X[:, 1], alpha=.5, color='k', marker='.')
    ax1.set_title(country_code[ctry])
    #-----------------------------------------------------------------------------
    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(ax1.get_xlim()[0], ax1.get_xlim()[1],
                                     100),
                         np.linspace(ax1.get_ylim()[0], ax1.get_ylim()[1],
                                     100))
    zz = np.c_[xx.ravel(), yy.ravel()]
    #-----------------------------------------------------------------------------
    mahal_emp_cov = emp_cov.mahalanobis(zz)
    mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
    emp_cov_contour = ax1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')
    #-----------------------------------------------------------------------------
    mahal_robust_cov = robust_cov.mahalanobis(zz)
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
    robust_contour = ax1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')
    ax1.legend([emp_cov_contour.collections[1], robust_contour.collections[1]],
               ['MLE dist', 'robust dist'],
               loc="upper right", borderaxespad=0)
    ax1.grid()
    return (fig, ax1, ctry)
Example #27
 def fit(self, X):
     """Fit detector.
     Parameters
     ----------
     X : numpy array of shape (n_samples, n_features)
         The input samples.
     """
     self.X_train = check_array(X)
     self.mcd = MinCovDet(store_precision=self.store_precision,
                     assume_centered=self.assume_centered,
                     support_fraction=self.support_fraction,
                     random_state=self.random_state)
      # note: this method's signature has no ``y``, so fit on X alone
      self.mcd.fit(X=self.X_train)
      return self
Example #28
def wcorr(x, y, w=None, robust=False):
    '''Weighted correlation coefficient
    
    Calculate the Pearson linear correlation coefficient of x and y using weights w. 
    This is derived from the weighted covariance and weighted variance.
    
    Args:
        x,y    : array of values 
        w      : array of weights for each element of x
        robust : (boolean) robust weights will be internally calculated using FastMCD;
                 only used if robust=True and w is empty
        
    Returns:
        scalar : weighted correlation coefficient
    '''

    n = len(x)
    assert len(y) == n, 'y must be the same length as x'

    # Use FastMCD to calculate weights; another method could be used here
    if robust and w is None:
        w = MinCovDet().fit(np.array([x, y]).T).support_

    if w is None or len(w) == 0:
        raise SystemExit('must specify weights w or select robust=True')
    assert len(w) == n, 'w must be the same length as x and y'
    w = wscale(w)
    return wcov(x, y, w) / np.sqrt(wvar(x, w) * wvar(y, w))
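wscale and wvar are helpers from the same module that this listing doesn't include (wmean and wcov appear below); a plausible stand-in for wscale, assuming it merely casts (possibly boolean) weights to float and normalizes them. The NIST-style formulas in these weighted helpers are invariant to a common rescaling of the weights, so the exact convention is not critical:

import numpy as np

def wscale(w):
    # hypothetical normalizer: cast boolean/integer weights to float and
    # rescale so the largest weight is 1; zero weights stay zero
    w = np.asarray(w, dtype=float)
    return w / w.max()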
Example #29
def wcov(x, y, w=None, ddof=1, robust=False):
    '''Weighted covariance 
    
    Calculate the covariance of x and y using weights w. If ddof=1 (default),
    then the result is the unbiased (sample) covariance when w=1.
    
    Implements weighted covariance as defined by NIST Dataplot (https://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf)
    
    Args:
        x,y    : array of values 
        w      : array of weights for each element of x; can be omitted if robust=True
        ddof   : scalar differential degrees of freedom (Default ddof=1)
        robust : (boolean) robust weights will be internally calculated using FastMCD;
                 only used if robust=True and w is empty
        
    Returns:
        scalar : weighted covariance   
    '''
    n = len(x)
    assert len(y) == n, 'y must be the same length as x'

    # Use FastMCD to calculate weights; another method could be used here
    if robust and w is None:
        w = MinCovDet().fit(np.array([x, y]).T).support_

    if w is None or len(w) == 0:
        raise SystemExit('must specify weights w or select robust=True')
    assert len(w) == n, 'w must be the same length as x and y'

    w = wscale(w)
    nw = np.count_nonzero(w)

    return np.sum( ( x - wmean(x,w) ) * ( y - wmean(y,w) ) * w ) / \
        ( np.sum(w) / nw * (nw - ddof) )
Example #30
def test_mcd_issue3367():
    # Check that MCD completes when the covariance matrix is singular
    # i.e. one of the rows and columns are all zeros
    rand_gen = np.random.RandomState(0)

    # Think of these as the values for X and Y -> 10 values between -5 and 5
    data_values = np.linspace(-5, 5, 10).tolist()
    # Get the cartesian product of all possible coordinate pairs from above set
    data = np.array(list(itertools.product(data_values, data_values)))

    # Add a third column that's all zeros to make our data a set of points
    # within a plane, which means that the covariance matrix will be singular
    data = np.hstack((data, np.zeros((data.shape[0], 1))))

    # The below line of code should raise an exception if the covariance matrix
    # is singular. As a further test, since we have points in XYZ, the
    # principal components (Eigenvectors) of these directly relate to the
    # geometry of the points. Since it's a plane, we should be able to test
    # that the Eigenvector that corresponds to the smallest Eigenvalue is the
    # plane normal, specifically [0, 0, 1], since everything is in the XY plane
    # (as I've set it up above). To do this one would start by:
    #
    #     evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
    #     normal = evecs[:, np.argmin(evals)]
    #
    # After which we need to assert that our `normal` is equal to [0, 0, 1].
    # Do note that there is floating point error associated with this, so it's
    # best to subtract the two and then compare some small tolerance (e.g.
    # 1e-12).
    MinCovDet(random_state=rand_gen).fit(data)
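The long comment above sketches a follow-up assertion on the plane normal; a runnable version of that check, under the same setup:

import itertools
import numpy as np
from sklearn.covariance import MinCovDet

data_values = np.linspace(-5, 5, 10).tolist()
data = np.array(list(itertools.product(data_values, data_values)))
data = np.hstack((data, np.zeros((data.shape[0], 1))))

mcd_fit = MinCovDet(random_state=np.random.RandomState(0)).fit(data)
evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
normal = evecs[:, np.argmin(evals)]
# the smallest-eigenvalue direction should be the plane normal, up to sign
assert np.allclose(np.abs(normal), [0.0, 0.0, 1.0], atol=1e-12)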
Example #31
def wmean(x, w=None, robust=False):
    '''Weighted mean 
    
    Calculate the mean of x using weights w.
    
    Args:
        x : array of values to be averaged
        w      : array of weights for each element of x; can be omitted if robust=True
        robust : (boolean) robust weights will be internally calculated using FastMCD;
                 only used if robust=True and w is empty
        
    Returns:
        scalar : weighted mean    
    '''
    if w is not None:
        assert len(w) == len(x), 'w must be the same length as x'

    # Use FastMCD to calculate weights; another method could be used here
    if robust and w is None:
        w = MinCovDet().fit(np.array([x, x]).T).support_

    if w is None or len(w) == 0:
        raise SystemExit('must specify weights w or select robust=True')
    assert len(w) == len(x), 'w must be the same length as x'

    return np.sum(x * w) / np.sum(w)
Example #32
def robust_mahalanobis_method(x=None, data=None):
    #Minimum covariance determinant method
    rng = np.random.RandomState(0)
    real_cov = np.cov(data.values.T)
    X = rng.multivariate_normal(mean=np.mean(data, axis=0),
                                cov=real_cov,
                                size=506)
    cov = MinCovDet(random_state=0).fit(X)
    mcd = cov.covariance_  #robust covariance matrix
    robust_mean = cov.location_  #robust mean
    inv_covmat = sp.linalg.inv(mcd)  #inverse covariance matrix

    #Calculate MD with minimum covariance determinant method
    x_minus_mu = x - robust_mean
    left_term = np.dot(x_minus_mu, inv_covmat)
    mahal = np.dot(left_term, x_minus_mu.T)
    md = mahal.diagonal()

    #Compare rMD with threshold and flag as outlier
    outlier = []
    C = chi2.ppf((1 - 0.001),
                 df=x.shape[1])  #degrees of freedom = number of variables
    for index, value in enumerate(md):
        if value > C:
            outlier.append(index)
        else:
            continue
    return outlier, md
Example #33
def compute_MCD_weft(weftsPickled, target_path):

    weft_points_list = floatPointList()
    for pickled_path in weftsPickled:
        weft_points_list.extend(pickle.load(open(pickled_path, "rb" )))

    x_vals = [fp.x for fp in weft_points_list]
    y_vals = [fp.y for fp in weft_points_list]

    mean_hor_dist = weft_points_list.getMedianWeftDist()

    min_x = min(x_vals) + 1.5 * mean_hor_dist
    max_x = max(x_vals) - 1.5 * mean_hor_dist
    min_y = min(y_vals) + 1.5 * mean_hor_dist
    max_y = max(y_vals) - 1.5 * mean_hor_dist

    inner_points = floatPointList()
    for pt in weft_points_list:
        if min_x < pt.x < max_x and min_y < pt.y < max_y:
            inner_points.append(pt)

    X = np.zeros([len(inner_points), 3])

    for idx, pt in enumerate(inner_points):
        X[idx,0] = pt.area
        X[idx,1] = pt.right_dist
        X[idx,2] = pt.left_dist

    Y = X[~(X<=0).any(axis=1)]

    robust_cov = MinCovDet(support_fraction=0.8).fit(Y)
    pickle.dump(robust_cov, open(target_path, "wb"))
Example #34
def robust_mahalanobis_method(df):
    #Minimum covariance determinant
    rng = np.random.RandomState(0)
    real_cov = np.cov(df.values.T)
    X = rng.multivariate_normal(mean=np.mean(df, axis=0),
                                cov=real_cov,
                                size=506)
    cov = MinCovDet(random_state=0).fit(X)
    mcd = cov.covariance_  #robust covariance matrix
    robust_mean = cov.location_  #robust mean
    inv_covmat = sp.linalg.inv(mcd)  #inverse covariance matrix

    #Robust M-Distance
    x_minus_mu = df - robust_mean
    left_term = np.dot(x_minus_mu, inv_covmat)
    mahal = np.dot(left_term, x_minus_mu.T)
    md = np.sqrt(mahal.diagonal())

    #Flag as outlier
    outlier = []
    C = np.sqrt(chi2.ppf(
        (1 - 0.001),
        df=df.shape[1]))  #degrees of freedom = number of variables
    for index, value in enumerate(md):
        if value > C:
            outlier.append(index)
        else:
            continue
    return outlier, md
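A usage sketch for the DataFrame variant above; the hard-coded size=506 suggests it was written against the 506-row Boston housing table, but any numeric DataFrame works (assumes the snippet's numpy, pandas, scipy-as-sp, and chi2 imports):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(506, 3), columns=['a', 'b', 'c'])
df.iloc[0] = [10.0, 10.0, 10.0]    # plant one gross outlier
outliers, md = robust_mahalanobis_method(df)
print(outliers)                    # expect [0], perhaps plus the odd stray row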
Example #35
    def analyze(self, mahalanobis_tolerance=2):
        self.inlier_points = np.zeros((len(self.points), 2))
        for id1 in range(len(self.points)):
            id2 = closest_point(self.points, self.points[id1], id1)[0]

            #keep lines for plotting purposes
            self.linedata[3*id1] = self.points[id1]
            self.linedata[3*id1+1] = self.points[id2]
            self.linedata[3*id1+2] = [None, None]

            # we are repeating every pi/2, so we compress the angle space by 4x
            a = 4*math.atan2((self.points[id1, 1] - self.points[id2, 1]), (self.points[id1, 0] - self.points[id2, 0]))
            r = np.linalg.norm(self.points[id1] - self.points[id2])
            self.polardata[id1] = [r*math.cos(a), r*math.sin(a)]

        #find the minimal covariance inlier cluster
        self.polar_cov = MinCovDet().fit(self.polardata)

        # extract the grid angle and size.  angle is divided by 4 because
        # we previously scaled it up to repeat every 90 deg
        self.theta = math.atan2(-self.polar_cov.location_[1], self.polar_cov.location_[0])/4
        self.step_size = np.linalg.norm(self.polar_cov.location_)

        # extract inlier points
        polar_mahal = self.polar_cov.mahalanobis(self.polardata)**(0.33)
        inlier_count = 0
        for i in range(len(polar_mahal)):
            if polar_mahal[i] < mahalanobis_tolerance: # stdev tolerance to outliers
                self.inlier_points[inlier_count] = self.points[i]
                self.inlier_indicies[inlier_count] = i
                inlier_count += 1

        self.normalized_points = rotate(self.inlier_points[:inlier_count], -self.theta)/self.step_size

        #enumerate grid IDs
        origin_id = closest_point(self.normalized_points, np.mean(self.normalized_points))[0]
        self.normalized_points = self.normalized_points - self.normalized_points[origin_id]
        inlier_count = 0

        self.bounds = [sys.maxsize, sys.maxsize, -sys.maxsize, -sys.maxsize]
        for p in self.normalized_points:
            x = round(p[0])
            y = round(p[1])
            d = np.linalg.norm(p-[x, y])
            if d < 0.4: #tolerance from unit position
                self.normalized_points[inlier_count] = [x, y]
                if (x < self.bounds[0]):
                    self.bounds[0] = x
                if (x > self.bounds[2]):
                    self.bounds[2] = x
                if (y < self.bounds[1]):
                    self.bounds[1] = y
                if (y > self.bounds[3]):
                    self.bounds[3] = y
                inlier_count += 1

        self.normalized_points = self.normalized_points[:inlier_count]
Example #36
def estimateGaussian(nb_objects_init, nb_objects_final, thr, who, genes, siRNA,
                     loadingFolder = '../resultData/thrivisions/predictions',
                     threshold=0.05,):
    
    arr=np.vstack((thr, nb_objects_init, nb_objects_final)).T    
    #deleting siRNAs that have only one experiment
    print(len(siRNA))
    all_=Counter(siRNA);siRNA = np.array(siRNA)
    toDelsi=filter(lambda x: all_[x]==1, all_)
    toDelInd=[]
    for si in toDelsi:
        toDelInd.extend(np.where(siRNA==si)[0])
    print(len(toDelInd))
    dd=dict(zip(range(4), [arr, who, genes, siRNA]))
    for array_ in dd:
        dd[array_]=np.delete(dd[array_],toDelInd,0 )
    arr, who, genes, siRNA = [dd[el] for el in range(4)]
    
    print(arr.shape)
    
    arr_ctrl=arr[np.where(np.array(genes)=='ctrl')]
    ctrlcov=MinCovDet().fit(arr_ctrl)
    
    robdist= ctrlcov.mahalanobis(arr)*np.sign(arr[:,0]-np.mean(arr[:,0]))
    new_siRNA=np.array(siRNA)[np.where((genes!='ctrl')&(robdist>0))]
    pval,qval =empiricalPvalues(np.absolute(robdist[np.where(genes=='ctrl')])[:, np.newaxis],\
                           robdist[np.where((genes!='ctrl')&(robdist>0))][:, np.newaxis],\
                           folder=loadingFolder, name="thrivision", sup=True, also_pval=True)
    assert new_siRNA.shape==qval.shape
    hits=Counter(new_siRNA[np.where(qval<threshold)[0]])
    
    hits=filter(lambda x: float(hits[x])/all_[x]>=0.5, hits)
    gene_hits = [genes[list(siRNA).index(el)] for el in hits]
    gene_hits=Counter(gene_hits)
    
    return robdist, pval,qval, hits, gene_hits
Example #37
#X1 = preprocessing.scale(X2)
n_samples = len(X)
n_outliers = int(n_samples * 0.05)  # must be an int for slicing below
n_features = 2

# generate data
# gen_cov = np.eye(n_features)
# gen_cov[0, 0] = 2.
# X = np.dot(np.random.randn(n_samples, n_features), gen_cov)
# # add some outliers
# outliers_cov = np.eye(n_features)
# outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
# X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MinCovDet().fit(X)

# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)

###############################################################################
# Display results
fig = plt.figure()
plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

# Show data set
subfig1 = plt.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
Example #38
n_samples = 125
n_outliers = 25
n_features = 2

# generate data
gen_cov = np.eye(n_features)
gen_cov[0, 0] = 2.
X = np.dot(np.random.randn(n_samples, n_features), gen_cov)
# add some outliers
outliers_cov = np.eye(n_features)
outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MinCovDet().fit(X)

# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)

###############################################################################
# Display results
fig = pl.figure()
pl.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

# Show data set
subfig1 = pl.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
Example #39
def main():
    parser = argparse.ArgumentParser(
        description='Plot outlier-like distances for a 2-dimensional dataset')
    parser.add_argument(
        'dataset', type=argparse.FileType('r'),
        help='a CSV file containing the dataset')
    parser.add_argument(
        '--plot', type=str, choices=['train', 'grid'], default='grid',
        help='plot the dataset or a grid evenly distributed over its span')
    parser.add_argument(
        '--plotdims', type=int, choices=[2, 3], default=2,
        help='the number of dimensions to plot')

    args = parser.parse_args()

    X = np.loadtxt(args.dataset, delimiter=',')
    fig = plt.figure()

    xformer = NullTransformer()

    if X.shape[1] > 2:
        xformer = PCA(n_components=2)
        X = xformer.fit_transform(X)

    if args.plotdims == 2:
        plt.scatter(X[:, 0], X[:, 1], s=60, linewidth='0')
    else:
        plt.scatter(X[:, 0], X[:, 1])
    plt.show(block=False)

    path_to_script = os.path.realpath(__file__)
    dir_of_script = os.path.dirname(path_to_script)
    dataset_path = dir_of_script + '/outliers.npy'
    np.save(dataset_path, X)
    
    ###########################################################################
    # Train autoencoder with the n samples until convergence.  Run
    # evenly distributed samples through the autoencoder and compute
    # their reconstruction error.
    ###########################################################################

    maxseq_orig = np.max(X)
    minseq_orig = np.min(X)
    seqrange = np.abs(maxseq_orig - minseq_orig)
    maxseq = maxseq_orig + 0.5 * seqrange
    minseq = minseq_orig - 0.5 * seqrange
    print("minseq", minseq, "maxseq", maxseq)
    if args.plot == 'grid':
        seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
        Xplot = np.array([_ for _ in product(seq, seq)])
    else:
        Xplot = X

    robust_cov = MinCovDet().fit(X)
    robust_md = robust_cov.mahalanobis(Xplot)

    empirical_cov = EmpiricalCovariance().fit(X)
    empirical_md = empirical_cov.mahalanobis(Xplot)

    # Assume Xplot is at least 2-dimensional.
    if Xplot.shape[1] > 2:
        Xplot2d = bh_sne(Xplot)
    else:
        Xplot2d = Xplot

    robust_md01 = robust_md - np.nanmin(robust_md)
    robust_md01 = robust_md01 / np.nanmax(robust_md01)

    empirical_md01 = empirical_md - np.nanmin(empirical_md)
    empirical_md01 = empirical_md01 / np.nanmax(empirical_md01)

    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], 
            cmap=plt.cm.jet, c=robust_md01, s=60, linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], robust_md01,
            cmap=plt.cm.jet, color=robust_md01)
        ax.set_zlabel('Mahalanobis distance')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (robust covariance)')

    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], 
            cmap=plt.cm.jet, c=empirical_md01, s=60, linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], empirical_md01,
            cmap=plt.cm.jet, color=empirical_md01)
        ax.set_zlabel('Mahalanobis distance')

    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (empirical covariance)')
    
    enc_dec = [
        # tanh encoder, linear decoder
        ['tanh', 'linear'],
        # sigmoid encoder, linear decoder
        ['sigmoid', 'linear'],
        #######################################################################
        # The reconstruction error of the autoencoders trained with the
        # remaining commented-out pairs don't seem to match Mahalanobis
        # distance very well.  Feel free to uncomment them to see for
        # yourself.
        # linear encoder, linear decoder
        # ['linear', 'linear'],
        # tanh encoder, tanh decoder
        # ['tanh', 'tanh'],
        # tanh encoder, sigmoid decoder
        # ['tanh', 'sigmoid'],
        # sigmoid encoder, tanh decoder
        # ['sigmoid', 'tanh'],
        # sigmoid encoder, sigmoid decoder
        # ['sigmoid', 'sigmoid']
        #######################################################################
    ]
    
    for i, act in enumerate(enc_dec):
        enc, dec = act
        if dec == 'linear':
            dec = None
        model = train_autoencoder(dataset_path,
            act_enc=enc, act_dec=dec, nvis=X.shape[1], nhid=16)
        
        Xshared = theano.shared(
            np.asarray(Xplot, dtype=theano.config.floatX), borrow=True)
        f = theano.function([], outputs=model.reconstruct(Xshared))
        fit = f()
        error = reconstruction_error(Xplot, fit)

        error01 = error - np.nanmin(error)
        error01 = error01 / np.nanmax(error01)
        
        fig = plt.figure()
        if args.plotdims == 2:
            ax = fig.add_subplot(1, 1, 1)
            ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
                cmap=plt.cm.jet, c=error, s=60, linewidth='0')
        else:
            ax = fig.add_subplot(1, 1, 1, projection='3d')
            ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], error,
                cmap=plt.cm.jet, color=error01)
            ax.set_zlabel('Reconstruction error')

        ax.set_xlabel('x')
        ax.set_ylabel('y')
        encdec_type = ', '.join(act) 
        ax.set_title('Reconstruction error (' + encdec_type + ')')

        print("Correlation of robust MD and reconstruction error (" +
            str(encdec_type) + ") " + str(pearsonr(robust_md, error)))
        print("Correlation of empirical MD and reconstruction error (" +
            str(encdec_type) + ") " + str(pearsonr(empirical_md, error)))

    print("Correlation of robust MD and empirical MD " +
        str(pearsonr(robust_md, empirical_md)))

    os.remove(dataset_path)
    os.remove('outliers.pkl')

    plt.show(block=True)
Example #40
n_samples = 125
n_outliers = 25
n_features = 2

# generate data
gen_cov = np.eye(n_features)
gen_cov[0, 0] = 2.
X = np.dot(np.random.randn(n_samples, n_features), gen_cov)
# add some outliers
outliers_cov = np.eye(n_features)
outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MinCovDet().fit(X)

# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)


# Display results
fig = pl.figure()

# Show data set
subfig1 = pl.subplot(3, 1, 1)
subfig1.scatter(X[:, 0], X[:, 1], color='black', label='inliers')
subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
Example #41
class Outlier_detection(object):

    def __init__(self, support_fraction = 0.95, verbose = True, chi2_percentile = 0.995):
        self.verbose = verbose
        self.support_fraction = support_fraction
        self.chi2 = stats.chi2
        self.mcd = MCD(store_precision = True, support_fraction = support_fraction)
        self.chi2_percentile = chi2_percentile
        
    def fit(self, X):
        """Prints some summary stats (if verbose is one) and returns the indices of what it consider to be extreme"""
        self.mcd.fit(X)
        mahalanobis = lambda p: distance.mahalanobis(p, self.mcd.location_, self.mcd.precision_  )
        d = np.array(list(map(mahalanobis, X))) #Mahalanobis distance values
        self.d2 = d ** 2 #MD squared
        n, self.degrees_of_freedom_ = X.shape
        self.iextreme_values = (self.d2 > self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_))
        if self.verbose:
            print("%.3f proportion of outliers at %.3f%% chi2 percentile, " % (self.iextreme_values.sum() / float(n), self.chi2_percentile))
            print("with support fraction %.2f." % self.support_fraction)
        return self

    def plot(self,log=False, sort = False ):
        """
        Cause plotting is always fun.
        
        log: transform the distance-sq to a log( distance-sq )
        sort: sort the data according to distance before plotting
        ifollow: a set of indices to mark with yellow, useful for seeing where data lies across views.
        
        """
        n = self.d2.shape[0]
        fig = plt.figure()
        
        x = np.arange( n )
        ax = fig.add_subplot(111)
 
 
        transform = (lambda x: x ) if not log else (lambda x: np.log(x))
        chi_line = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_)     
        
        chi_line = transform( chi_line )
        d2 = transform( self.d2 )
        if sort:
            isort = np.argsort( d2 )    
            ax.scatter(x, d2[isort], alpha = 0.7, facecolors='none' )
            plt.plot( x, transform(self.chi2.ppf( np.linspace(0,1,n),self.degrees_of_freedom_ )), c="r", label="distribution assuming normal" )
            
        
        else:
            ax.scatter(x, d2 )
            extreme_values = d2[ self.iextreme_values ]
            ax.scatter( x[self.iextreme_values], extreme_values, color="r" )
            
        ax.hlines( chi_line, 0, n, 
                        label ="%.1f%% $\chi^2$ quantile"%(100*self.chi2_percentile), linestyles = "dotted" )

        ax.legend()
        ax.set_ylabel("distance squared")
        ax.set_xlabel("observation")
        ax.set_xlim(0, self.d2.shape[0])


        plt.show()
Example #42
 def __init__(self, support_fraction = 0.95, verbose = True, chi2_percentile = 0.995):
     self.verbose = verbose
     self.support_fraction = support_fraction
     self.chi2 = stats.chi2
     self.mcd = MCD(store_precision = True, support_fraction = support_fraction)
     self.chi2_percentile = chi2_percentile
Example #43
lm2 = ols('word_diff ~ Age + C(Centre_ID)',
         data=clean_st,subset=subset).fit()

print(lm2.summary())

# <markdowncell>

# # Snippets. Might come back to this later:

# <codecell>

from scipy.stats import pearsonr
from sklearn.covariance import MinCovDet

# just look at what's interesting for now, and drop the NAs involved
clean = st_v_merged.loc[:,['norm_diff','Interview_Suggested_Ranking_numerical_']]
clean = clean.dropna(axis=0)

# calculate robust covariance estimate, calculate what's too far away
mcd = MinCovDet()
mcd.fit(clean)

pearsonr(clean.iloc[:,0],clean.iloc[:,1])

# <codecell>

d = mcd.mahalanobis(clean)
d.sort()
d

Example #44
# computation
for i, n_outliers in enumerate(range_n_outliers):
    for j in range(repeat):
        # generate data
        X = np.random.randn(n_samples, n_features)
        # add some outliers
        outliers_index = np.random.permutation(n_samples)[:n_outliers]
        outliers_offset = 10. * \
            (np.random.randint(2, size=(n_outliers, n_features)) - 0.5)
        X[outliers_index] += outliers_offset
        inliers_mask = np.ones(n_samples).astype(bool)
        inliers_mask[outliers_index] = False

        # fit a Minimum Covariance Determinant (MCD) robust estimator to data
        S = MinCovDet().fit(X)
        # compare raw robust estimates with the true location and covariance
        err_loc_mcd[i, j] = np.sum(S.location_ ** 2)
        err_cov_mcd[i, j] = S.error_norm(np.eye(n_features))
        # compare estimators learnt from the full data set with true parameters
        err_loc_emp_full[i, j] = np.sum(X.mean(0) ** 2)
        err_cov_emp_full[i, j] = EmpiricalCovariance().fit(X).error_norm(
            np.eye(n_features))
        # compare with an empirical covariance learnt from a pure data set
        # (i.e. "perfect" MCD)
        pure_X = X[inliers_mask]
        pure_location = pure_X.mean(0)
        pure_emp_cov = EmpiricalCovariance().fit(pure_X)
        err_loc_emp_pure[i, j] = np.sum(pure_location ** 2)
        err_cov_emp_pure[i, j] = pure_emp_cov.error_norm(np.eye(n_features))
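The loop above uses range_n_outliers, repeat, and the err_* arrays without defining them; it matches scikit-learn's robust-vs-empirical covariance example, so a plausible preamble (shapes inferred from how the loop indexes them) would be:

import numpy as np
from sklearn.covariance import EmpiricalCovariance, MinCovDet

n_samples = 80
n_features = 5
repeat = 10
# contamination levels to sweep, denser at the low end
range_n_outliers = np.concatenate(
    (np.linspace(0, n_samples / 8, 5),
     np.linspace(n_samples / 8, n_samples / 2, 5)[1:-1])).astype(int)

shape = (range_n_outliers.size, repeat)
err_loc_mcd = np.zeros(shape)
err_cov_mcd = np.zeros(shape)
err_loc_emp_full = np.zeros(shape)
err_cov_emp_full = np.zeros(shape)
err_loc_emp_pure = np.zeros(shape)
err_cov_emp_pure = np.zeros(shape)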