Exemplo n.º 1
0
def MCD_Score(train_a, test_a, test_b):
    """Fit an MCD estimator on ``train_a`` and print scores for two test sets.

    A Minimum Covariance Determinant estimator is fitted on ``train_a``.  The
    values returned by its ``mahalanobis`` method for ``test_a`` and
    ``test_b`` are then printed in a single line.  Nothing is returned.
    """
    estimator = MinCovDet().fit(train_a)
    scores_a = estimator.mahalanobis(test_a)
    scores_b = estimator.mahalanobis(test_b)
    template = "mcd ano score {} mcd normal score {}"
    print(template.format(scores_a, scores_b))
def MCD_ano_score():
    """Print MCD Mahalanobis scores for the module-level data sets.

    Reads the globals ``train_normal``, ``test_normal`` and ``test_ano``:
    the estimator is fitted on ``train_normal`` and evaluated on both test
    sets.  Nothing is returned.
    """
    print("マハラノビス距離(each MCD) ano score")
    estimator = MinCovDet().fit(train_normal)
    # NOTE(review): the value printed after "ano score" is computed from
    # test_normal and the one after "normal score" from test_ano.  This
    # mirrors MCD_Score's argument order -- confirm the labels are intended.
    first_scores = estimator.mahalanobis(test_normal)
    second_scores = estimator.mahalanobis(test_ano)
    template = "mcd ano score {} mcd normal score {}"
    print(template.format(first_scores, second_scores))
Exemplo n.º 3
0
def detect(train_data: np.ndarray, test_data: np.ndarray) -> list:
    """Flag test samples that lie farther out than every training sample.

    An MCD estimator is fitted on ``train_data``; the largest value its
    ``mahalanobis`` method yields on the training data is used as the cut-off.

    :param train_data: samples used to fit the estimator and set the cut-off
    :param test_data: samples to classify
    :return: list with one entry per test sample, ``0`` when its distance is
        less than or equal to the cut-off and ``1`` otherwise
    """
    estimator = MinCovDet().fit(train_data)
    cutoff = np.max(estimator.mahalanobis(train_data))
    test_dist = estimator.mahalanobis(test_data)
    return np.where(test_dist <= cutoff, 0, 1).tolist()
Exemplo n.º 4
0
def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov,
                          tol_support):
    """Check MinCovDet on synthetic Gaussian data with planted outliers.

    Standard-normal data is generated from a fixed seed, ``n_outliers`` rows
    are shifted by +/-5 in every feature, and an MCD estimator is fitted on
    the contaminated data.  The function asserts that

    * the mean squared error between the robust location and the mean of the
      uncontaminated rows is below ``tol_loc``,
    * the mean squared error between the robust covariance and the empirical
      covariance of the uncontaminated rows is below ``tol_cov``,
    * the support mask selects at least ``tol_support`` samples,
    * ``mahalanobis(data)`` reproduces the stored ``dist_`` attribute.
    """
    rng = np.random.RandomState(0)
    data = rng.randn(n_samples, n_features)

    # Plant the outliers: pick rows at random and offset each feature by +/-5.
    contaminated_rows = rng.permutation(n_samples)[:n_outliers]
    shifts = 10. * (rng.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[contaminated_rows] += shifts

    is_inlier = np.ones(n_samples, dtype=bool)
    is_inlier[contaminated_rows] = False
    clean_rows = data[is_inlier]

    # The same RandomState is handed to the estimator, as in the original.
    fitted = MinCovDet(random_state=rng).fit(data)

    location_mse = np.mean((clean_rows.mean(0) - fitted.location_) ** 2)
    assert location_mse < tol_loc
    covariance_mse = np.mean(
        (empirical_covariance(clean_rows) - fitted.covariance_) ** 2)
    assert covariance_mse < tol_cov
    assert np.sum(fitted.support_) >= tol_support
    assert_array_almost_equal(fitted.mahalanobis(data), fitted.dist_)
Exemplo n.º 5
0
def outliers_finder(data_frame: pd.DataFrame, n_components: int = 5,
                    p_threshold: float = 0.001) -> pd.DataFrame:
    """
    Find and remove outliers using robust Mahalanobis distances on PCA scores.

    The feature part of ``data_frame`` (as returned by ``splitting_dataset``)
    is standardised and projected with PCA.  An MCD estimator is fitted on the
    leading ``n_components`` scores and each row receives a chi-square p-value
    for its distance.  Rows with ``p <= p_threshold`` are dropped.

    The input frame is modified in place: the columns ``'mahalanobis'``,
    ``'p'`` and ``'Drops'`` are added and the outlier rows removed.  The
    number of removed rows is printed.

    :param data_frame: data set to clean (mutated in place)
    :param n_components: number of leading principal components to use
    :param p_threshold: rows whose p-value is at or below this are dropped
    :return: the same ``data_frame`` object, with outliers removed
    """
    df_X, _ = splitting_dataset(data_frame)

    # Run PCA on scaled data and keep the leading score columns.
    scores = PCA().fit_transform(StandardScaler().fit_transform(df_X.values))
    retained = scores[:, :n_components]

    # Fit a Minimum Covariance Determinant (MCD) robust estimator and score
    # every row with it.
    robust_cov = MinCovDet().fit(retained)
    data_frame['mahalanobis'] = robust_cov.mahalanobis(retained)

    # The chi-square degrees of freedom must equal the number of dimensions
    # the distance was computed in (the original hard-coded 3 while using 5
    # components).
    degrees_of_freedom = retained.shape[1]
    data_frame['p'] = 1 - chi2.cdf(data_frame['mahalanobis'],
                                   degrees_of_freedom)
    data_frame['Drops'] = data_frame['p'] <= p_threshold

    indexNames = data_frame[data_frame['Drops']].index
    print(indexNames.size)
    data_frame.drop(indexNames, inplace=True)

    return data_frame
Exemplo n.º 6
0
def find_outliers_mahalanobis(featMatProjected,
                              extremeness=2.,
                              figsize=(8, 8),
                              saveto=None):
    """ Return the indices of outlier rows using the robust Mahalanobis distance.

        An MCD estimator is fitted on the leading principal components (at
        most 10, fewer if the input has fewer columns).  A scatter plot of
        PC1 vs PC2 coloured by distance is either saved to ``saveto`` or
        shown interactively.

        Outlier threshold = mean(distance) + std(distance) * extremeness
        [extreme_values=2, very_extreme_values=3 --> according to 68-95-99.7 rule]

        Parameters
        ----------
        featMatProjected : 2-D array of PCA scores (rows = samples); needs at
            least two columns for the scatter plot
        extremeness : number of standard deviations above the mean distance
        figsize : figure size passed to ``plt.subplots``
        saveto : optional output path (``str`` or ``Path``); the file format
            is taken from its suffix

        Returns
        -------
        Integer array of row indices whose distance is >= the threshold.
    """
    import numpy as np
    import seaborn as sns
    from pathlib import Path
    from sklearn.covariance import MinCovDet
    from matplotlib import pyplot as plt

    # NB: Euclidean distance puts more weight than it should on correlated variables
    # Chicken and egg situation, we can't know they are outliers until we calculate
    # the stats of the distribution, but the stats of the distribution are skewed by outliers!
    # Mahalanobis gets around this by weighting by robust estimation of covariance matrix

    # Use the first 10 principal components, or all of them if fewer exist.
    n_pcs = min(10, featMatProjected.shape[1])
    leading_pcs = featMatProjected[:, :n_pcs]

    # Fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(leading_pcs)

    # Get the Mahalanobis distance
    MahalanobisDist = robust_cov.mahalanobis(leading_pcs)

    if saveto:
        plt.ioff()
    else:
        plt.ion()
    plt.close('all')
    plt.style.use(CUSTOM_STYLE)
    sns.set_style('ticks')
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_facecolor('#F7FFFF')
    plt.scatter(leading_pcs[:, 0],
                leading_pcs[:, 1],
                c=MahalanobisDist)  # colour PCA by Mahalanobis distance
    plt.title('Mahalanobis Distance for Outlier Detection', fontsize=20)
    plt.colorbar()
    ax.grid(False)

    if saveto:
        # Accept plain strings as well as Path objects.
        saveto = Path(saveto)
        saveto.parent.mkdir(exist_ok=True, parents=True)
        suffix = saveto.suffix.strip('.')
        plt.savefig(saveto, format=suffix or None, dpi=300)
    else:
        plt.show()

    upper_t = np.mean(MahalanobisDist) + np.std(MahalanobisDist) * extremeness
    # np.where yields an integer index array even when nothing is flagged.
    outliers = np.where(MahalanobisDist >= upper_t)[0]
    print("Outliers found: %d" % len(outliers))

    return outliers
def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov,
                          tol_support):
    """Fit MinCovDet on contaminated Gaussian data and assert its accuracy.

    The data set is drawn from a seeded ``RandomState``; ``n_outliers``
    randomly chosen rows get every feature offset by +5 or -5.  After fitting
    on the contaminated data the robust location and covariance are compared
    (mean squared error) with the estimates from the untouched rows against
    ``tol_loc`` and ``tol_cov``, the support size is compared with
    ``tol_support``, and ``mahalanobis(data)`` is checked against ``dist_``.
    """
    seeded = np.random.RandomState(0)
    samples = seeded.randn(n_samples, n_features)

    # add some outliers
    outlier_rows = seeded.permutation(n_samples)[:n_outliers]
    offsets = seeded.randint(2, size=(n_outliers, n_features)) - 0.5
    samples[outlier_rows] += 10. * offsets

    keep = np.ones(n_samples, dtype=bool)
    keep[outlier_rows] = False
    untouched = samples[keep]

    # compute MCD by fitting an object (sharing the generator's state)
    mcd = MinCovDet(random_state=seeded).fit(samples)
    robust_location = mcd.location_
    robust_covariance = mcd.covariance_
    support_mask = mcd.support_

    # compare with the estimates learnt from the inliers
    loc_err = np.mean((untouched.mean(0) - robust_location) ** 2)
    assert loc_err < tol_loc
    cov_err = np.mean(
        (empirical_covariance(untouched) - robust_covariance) ** 2)
    assert cov_err < tol_cov
    assert np.sum(support_mask) >= tol_support
    assert_array_almost_equal(mcd.mahalanobis(samples), mcd.dist_)
def getMahalanobisRobust(dat, critical_alpha = 0.01, good_rows = np.zeros(0)):
    '''Calculate robust Mahalanobis distances and flag outliers.

    An MCD estimator is fitted on ``dat[good_rows]`` and evaluated on every
    row of ``dat``.

    Parameters
    ----------
    dat : 2-D array, rows are samples
    critical_alpha : tail probability of the chi-square cut-off
    good_rows : boolean mask / index of rows used for fitting.  When empty
        (the default) or None, rows with at least one non-NaN value are used.

    Returns
    -------
    mahalanobis_dist : distance of every row (square root of the value
        returned by ``MinCovDet.mahalanobis``); all zeros if fitting failed
    outliers : boolean array, True where the distance exceeds ``maha_lim``
    maha_lim : critical distance, on the same (square-rooted) scale as
        ``mahalanobis_dist``
    '''
    if good_rows is None or good_rows.size == 0:
        # NOTE(review): rows that are only partially NaN pass this filter and
        # make the fit below fall back to zeros -- confirm `any` is intended.
        good_rows = np.any(~np.isnan(dat), axis=1)

    try:
        robust_cov = MinCovDet().fit(dat[good_rows])
        mahalanobis_dist = np.sqrt(robust_cov.mahalanobis(dat))
    except ValueError:
        # This step fails when the covariance matrix is singular.  This
        # happens if the data is not a unimodal symmetric distribution, for
        # example when there are too many small noisy particles.  The safe
        # option taken here is to return zeros for the distances.
        mahalanobis_dist = np.zeros(dat.shape[0])

    # Critical value from the chi-square distribution.  The chi-square
    # quantile applies to the *squared* distance, so take its square root to
    # compare it with the square-rooted distances computed above.
    # https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    maha_lim = np.sqrt(chi2.ppf(1 - critical_alpha, dat.shape[1]))
    outliers = mahalanobis_dist > maha_lim

    return mahalanobis_dist, outliers, maha_lim
def _h_getMahalanobisRobust(dat, critical_alpha=0.01, good_rows=np.zeros(0)):
    '''Calculate robust Mahalanobis distances and flag outliers.

    An MCD estimator is fitted on ``dat[good_rows]`` and evaluated on every
    row of ``dat``.

    Parameters
    ----------
    dat : 2-D array, rows are samples
    critical_alpha : tail probability of the chi-square cut-off
    good_rows : boolean mask / index of rows used for fitting.  When empty
        (the default) or None, rows with at least one non-NaN value are used.

    Returns
    -------
    mahalanobis_dist : distance of every row (square root of the value
        returned by ``MinCovDet.mahalanobis``); all zeros if fitting raised
        ``ValueError``
    outliers : boolean array, True where the distance exceeds ``maha_lim``
    maha_lim : critical distance, on the same (square-rooted) scale as
        ``mahalanobis_dist``

    Raises
    ------
    AssertionError : if the rows selected for fitting still contain NaN
    '''
    if good_rows is None or good_rows.size == 0:
        good_rows = np.any(~np.isnan(dat), axis=1)

    try:
        dat2fit = dat[good_rows]
        # Deliberately an assertion (not caught below): NaN in the fitting
        # data is treated as a caller error rather than a fit failure.
        assert not np.any(np.isnan(dat2fit))

        robust_cov = MinCovDet().fit(dat2fit)
        mahalanobis_dist = np.sqrt(robust_cov.mahalanobis(dat))
    except ValueError:
        # This step fails when the covariance matrix is singular.  This
        # happens if the data is not a unimodal symmetric distribution, for
        # example when there are too many small noisy particles.  The safe
        # option taken here is to return zeros for the distances.
        mahalanobis_dist = np.zeros(dat.shape[0])

    # Critical value from the chi-square distribution.  The chi-square
    # quantile applies to the *squared* distance, so take its square root to
    # compare it with the square-rooted distances computed above.
    # https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    maha_lim = np.sqrt(chi2.ppf(1 - critical_alpha, dat.shape[1]))
    outliers = mahalanobis_dist > maha_lim

    return mahalanobis_dist, outliers, maha_lim
Exemplo n.º 10
0
class ActionDetector(object):
    """
    Publish whether the robot is in action or not to rostopic, by MT method.

    A Minimum Covariance Determinant estimator is fitted once on recorded
    "no action" spectra; each incoming spectrogram is then scored against it
    and compared with the ``~anormal_threshold`` parameter.

    NOTE
    Before starting to detect action, some waiting time is required.
    This is preparation time to calculate mahalanobis distance.
    Reaction speed for action detection is a bit late
    because spectrum is mean of spectrogram, not right edge of spectrogram
    """

    def __init__(self):
        """Load the noise data, fit the estimator and set up ROS pub/sub.

        Logs an error and exits the process when ``noise.npy`` is missing
        from the package's ``train_data`` directory.
        """
        # Config for loading no action spectrum (noise data)
        rospack = rospkg.RosPack()
        self.train_dir = osp.join(rospack.get_path(
            'decopin_hand'), 'train_data')
        if not osp.exists(self.train_dir):
            makedirs(self.train_dir)
        self.noise_data_path = osp.join(self.train_dir, 'noise.npy')
        if not osp.exists(self.noise_data_path):
            rospy.logerr('{} is not found. Exit.'.format(self.noise_data_path))
            exit()
        no_action_data = np.load(self.noise_data_path)
        # Extract about 100 samples from no_action_data.  Integer division is
        # required: a float slice step raises TypeError under Python 3.
        divide = max(1, len(no_action_data) // 100)
        no_action_data = no_action_data[::divide]
        # Detect in action or not by mahalanobis distance
        self.anormal_threshold = rospy.get_param('~anormal_threshold')
        self.mcd = MinCovDet()
        self.mcd.fit(no_action_data)
        rospy.loginfo('Calc covariance matrix for Mahalanobis distance')

        # ROS
        self.bridge = CvBridge()
        self.pub = rospy.Publisher('~in_action', Bool, queue_size=1)
        self.sub = rospy.Subscriber('~raw_spectrogram', Image, self.cb)

    def cb(self, msg):
        """
        Main process of ActionDetector class
        Publish whether the robot is in action or not

        The spectrogram in ``msg`` is averaged over time, scored with the
        fitted estimator, and ``True`` is published on ``~in_action`` when
        the score is at or above the threshold.
        """

        # spectrogram.shape is (height, width) = (spectrum, time)
        spectrogram = self.bridge.imgmsg_to_cv2(msg)
        self.current_spectrum = np.average(spectrogram, axis=1)
        # Check whether current spectrogram is in action or not.
        # [None] adds a leading axis so the spectrum is a single-row batch.
        spectrum = self.current_spectrum[None]
        dist = self.mcd.mahalanobis(spectrum)[0]
        info_message = '(mahalanobis distance, threshold) = ({}, {})'.format(
            dist, self.anormal_threshold)
        if dist < self.anormal_threshold:
            self.in_action = False
            rospy.loginfo('No action\n' + info_message + '\n')
        else:
            self.in_action = True
            rospy.loginfo('### In action ###\n' + info_message + '\n')
        pub_msg = Bool(data=self.in_action)
        self.pub.publish(pub_msg)
Exemplo n.º 11
0
def mahalanobis_calculate(data, num_pcs):
    """Return robust Mahalanobis values of ``data`` in PCA score space.

    ``data`` is projected onto ``num_pcs`` principal components; an MCD
    estimator is fitted on those scores and its ``mahalanobis`` output for
    the same scores is returned.
    """
    scores = PCA(num_pcs).fit_transform(data)
    # Robust (Minimum Covariance Determinant) fit, then score the same points.
    estimator = MinCovDet().fit(scores)
    return estimator.mahalanobis(scores)
def as7262_outliers(data, scatter_correction=None):
    """Exploratory outlier check on the AS7262 wavelength columns of ``data``.

    Selects the columns listed in the module-level ``as7262_wavelengths``,
    optionally applies SNV or MSC scatter correction, fits an MCD estimator
    and z-scores the resulting Mahalanobis values.  The row with the largest
    z-score is printed and highlighted in red on a plot of all spectra.
    Intermediate values are printed for debugging; nothing is returned.

    :param data: table indexable by ``as7262_wavelengths``
        (presumably a pandas DataFrame -- ``.loc``/``.iloc`` are used below)
    :param scatter_correction: ``"SNV"``, ``"MSC"`` or ``None`` (no correction)
    """
    data_columns = data[as7262_wavelengths]
    print(data_columns)
    # data_columns.T.plot()
    # plt.plot(data_columns.T)
    # NOTE(review): nothing has been plotted at this point (the plot calls
    # above are commented out), so this show() displays no new figure.
    plt.show()
    if scatter_correction == "SNV":
        data_columns = processing.snv(data_columns)
    elif scatter_correction == "MSC":
        data_columns, _ = processing.msc(data_columns)

    # svm = OneClassSVM().fit_predict(snv_data)
    # print(svm)
    # Robust covariance fit; score every row against it.
    robust_cov = MinCovDet().fit(data_columns)
    mahal_dist = robust_cov.mahalanobis(data_columns)
    # mahal_dist = MahalanobisDist(np.array(data_columns), verbose=True)
    print(mahal_dist)


    # NOTE(review): the result of this call is discarded.
    zscore(data_columns)
    print('+++++')
    mean = np.mean(mahal_dist)
    # Despite its name, `std` holds three standard deviations.
    std = 3*np.std(mahal_dist)
    print(mean, std)
    print(mean - std, mean + std)
    # z-score of each distance (uses one standard deviation, not `std`).
    zscore_mahal = (mahal_dist - mean) / np.std(mahal_dist)
    # print(zscore_mahal)
    # print(zscore_mahal.max(), zscore_mahal.argmax(), data_columns.loc[zscore_mahal.argmax()])
    print('pppp')
    print(data_columns)
    print(zscore_mahal.argmax())
    # NOTE(review): this selection of all rows with z-score > 3 is
    # immediately overwritten by the next line, so only the single most
    # extreme row is used below.
    outliers = data_columns.loc[zscore_mahal > 3].index
    outliers = data_columns.iloc[zscore_mahal.argmax()].name
    # print(data_columns.loc[zscore_mahal > 3].index)
    rows = data_columns.loc[outliers]
    # print(data_columns.loc[zscore_mahal.argmax()].name)
    print(data_columns.shape)
    print(rows)

    # print((mahal_dist-mahal_dist.mean()).std())
    # print(mahal_dist.std())
    # print(mahal_dist.mean() + 3*mahal_dist.std())
    # mahal_dist2 = MahalanobisDist(np.array(data_columns), verbose=True)
    # Histogram of the z-scored distances (n and bins are not used further).
    n, bins, _ = plt.hist(zscore_mahal, bins=40)
    plt.show()

    # x_hist = np.linspace(min(mahal_dist), max(mahal_dist), 100)
    #
    # popt, pcov = curve_fit(gauss_function, bins[:len(n)], n, maxfev=100000, p0=[300, 0, 20])
    # new_fit = gauss_function(x_hist, *popt)
    # plt.plot(x_hist, new_fit, 'r--')
    # color = data_columns.shape[0] * ["#000000"]
    # color[data_columns.loc[zscore_mahal.argmax()].name] = "#FF0000"
    # All spectra in black, the most extreme row in red, the mean in blue.
    plt.plot(data_columns.T, c="black")
    plt.plot(rows.T, c="red")
    plt.plot(data_columns.mean(), c="blue", lw=4)
    # snv_data.T.plot(color=color)
    plt.show()
Exemplo n.º 13
0
def mahalanobisDistances(dm):
    """Plot empirical vs robust Mahalanobis contours of ``dm`` in 2-D PCA space.

    ``dm`` is reduced to two principal components.  An empirical and an MCD
    covariance estimator are fitted on the reduced data, the points are
    scattered in the top panel and the square-rooted ``mahalanobis`` output
    of both estimators is drawn as contour lines (dashed = empirical,
    dotted = robust).  Two further, empty panels are created before the
    figure is shown.  Nothing is returned.
    """
    reduced_data = PCA(n_components=2).fit_transform(dm)
    robust_cov = MinCovDet().fit(reduced_data)
    emp_cov = EmpiricalCovariance().fit(reduced_data)

    plt.figure()
    plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
    subfig1 = plt.subplot(3, 1, 1)
    subfig1.scatter(reduced_data[:, 0], reduced_data[:, 1],
                    color='black', label='inliers')

    # NOTE(review): the right x-limit is hard-coded to 11 regardless of data.
    left_limit = subfig1.get_xlim()[0]
    subfig1.set_xlim(left_limit, 11.)
    subfig1.set_title("Mahalanobis distances of a contaminated data set:")

    # Evaluate both distance functions on a 100 x 100 grid over the axes.
    x_lo, x_hi = plt.xlim()[0], plt.xlim()[1]
    y_lo, y_hi = plt.ylim()[0], plt.ylim()[1]
    xx, yy = np.meshgrid(np.linspace(x_lo, x_hi, 100),
                         np.linspace(y_lo, y_hi, 100))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    emp_grid = emp_cov.mahalanobis(grid_points).reshape(xx.shape)
    subfig1.contour(xx, yy, np.sqrt(emp_grid),
                    cmap=plt.cm.PuBu_r,
                    linestyles='dashed')

    robust_grid = robust_cov.mahalanobis(grid_points).reshape(xx.shape)
    subfig1.contour(xx, yy, np.sqrt(robust_grid),
                    cmap=plt.cm.YlOrBr_r, linestyles='dotted')

    plt.xticks(())
    plt.yticks(())

    # Per-point scores.
    # NOTE(review): these two arrays are computed but never plotted or
    # returned, and the inputs are centred here although the estimators'
    # mahalanobis() is documented to centre on location_ itself -- confirm
    # before reusing them.
    _emp_scores = emp_cov.mahalanobis(
        reduced_data - np.mean(reduced_data, 0)) ** (0.33)
    plt.subplot(2, 2, 3)
    plt.yticks(())

    _robust_scores = robust_cov.mahalanobis(
        reduced_data - robust_cov.location_) ** (0.33)
    plt.subplot(2, 2, 4)
    plt.yticks(())

    plt.show()
def mahal_plot(e):
    """Scatter consecutive values of ``e`` with Mahalanobis distance contours.

    Pairs ``(e[i], e[i + 1])`` for ``i >= 1`` are formed (the first element
    of ``e`` is not used), an empirical and an MCD covariance estimator are
    fitted on them, and the square-rooted ``mahalanobis`` output of both is
    drawn as contour lines over the scatter plot.  The correlation matrix of
    the two series is printed.

    :param e: 1-D sequence of values
    :return: tuple ``(robust_cov, emp_cov)`` of the fitted estimators
    """
    first_half = e[1:len(e) - 1]
    second_half = e[2:len(e)]
    X = np.transpose(np.array([first_half, second_half]))

    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(X)
    # maximum-likelihood estimator on the same data, for comparison
    emp_cov = EmpiricalCovariance().fit(X)

    plt.figure()

    # Show data set
    subfig1 = plt.subplot(1, 1, 1)
    inlier_plot = subfig1.scatter(first_half,
                                  second_half,
                                  color='black',
                                  label='daily diff in homes passed')

    subfig1.set_title("Mahalanobis distances of the iid invariants:")

    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(plt.xlim()[0],
                                     plt.xlim()[1], 800),
                         np.linspace(plt.ylim()[0],
                                     plt.ylim()[1], 100))

    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
    emp_cov_contour = subfig1.contour(xx,
                                      yy,
                                      np.sqrt(mahal_emp_cov),
                                      cmap=plt.cm.PuBu_r,
                                      linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
    # contour() takes `linewidths` (plural, numeric).  `color` cannot be
    # combined with `cmap`, so the colormap alone determines line colours.
    robust_contour = subfig1.contour(xx,
                                     yy,
                                     np.sqrt(mahal_robust_cov),
                                     cmap=plt.cm.YlOrBr_r,
                                     linewidths=3)

    # legend_elements() provides per-level legend handles; direct access to
    # ContourSet.collections is no longer available in recent matplotlib.
    emp_handle = emp_cov_contour.legend_elements()[0][1]
    robust_handle = robust_contour.legend_elements()[0][1]
    subfig1.legend([emp_handle, robust_handle, inlier_plot],
                   ['MLE dist', 'robust dist', 'kpis'],
                   loc="upper right",
                   borderaxespad=0)
    print(np.corrcoef(first_half, second_half))
    return (robust_cov, emp_cov)
Exemplo n.º 15
0
def get_outliers(X, chi2thr=0.975, plot=False, figurename=None):
    """Detect outliers by robust Mahalanobis distance.

    An MCD estimator (fixed ``random_state=100``) is fitted on ``X``; a
    sample is flagged when its ``mahalanobis`` value exceeds the ``chi2thr``
    quantile of a chi-square distribution with ``X.shape[1]`` degrees of
    freedom.  ``plot`` and ``figurename`` are accepted but not used.

    Returns ``(outlierpercent, y_pred, MD)``: the flagged fraction, the
    boolean flags and the distance values.
    """
    estimator = MinCovDet(random_state=100).fit(X)
    MD = estimator.mahalanobis(X)
    cutoff = stats.chi2.ppf(chi2thr, X.shape[1])
    y_pred = MD > cutoff
    outlierpercent = sum(y_pred) / float(len(MD))
    return outlierpercent, y_pred, MD
Exemplo n.º 16
0
def l_ratio(X, labels):
    ''' L-ratio of every cluster: a measure of how well a cluster is
        separated from the points that do not belong to it.

        For each cluster a robust (MCD) covariance is fitted on its members.
        Every point outside the cluster is scored with the estimator's
        ``mahalanobis`` method, and the chi-square upper-tail probabilities
        (``1 - cdf``) of those scores are summed and divided by the cluster
        size.

        ATTENTION:  the covariance matrix is estimated with the robust
                    covariance (outliers not taken into account)

    Parameters
    ----------
        X : ndarray
            Data (assumed to be multivariate normal distributed)
        labels : ndarray
            Labels; the label -1 is ignored as a cluster

    Returns
    -------
        lr : list, size(number of clusters)
            L-ratio for each cluster, in the iteration order of the set of
            unique labels.  The integer -1 is returned instead when there is
            no label other than -1.
    '''
    # unique labels, without the -1 label
    cluster_ids = set(labels).difference([-1])

    # nothing to score: signal it with -1
    if not cluster_ids:
        return -1

    # degrees of freedom = number of features
    n_features = len(X[0])

    ratios = []
    for cluster_id in cluster_ids:
        members = X[(labels == cluster_id)]
        non_members = X[(labels != cluster_id)]
        cluster_size = len(members)

        # robust covariance of the cluster, then score everything outside it
        estimator = MinCovDet().fit(members)
        outside_scores = estimator.mahalanobis(non_members)

        tail_mass = np.sum(1 - chi2.cdf(outside_scores, n_features))
        ratios.append(tail_mass / cluster_size)

    return ratios
Exemplo n.º 17
0
def RejectOutliers(data, threshold=3):
    """
    Split sample indices into inliers and outliers by robust Mahalanobis value.

    An MCD estimator is fitted on ``data``; samples whose ``mahalanobis``
    value is >= :threshold: are outliers, those below it are inliers.

    Returns ``(inliers, outliers)`` as index arrays.
    """
    from sklearn.covariance import MinCovDet

    scores = MinCovDet().fit(data).mahalanobis(data)
    (outliers,) = np.where(scores >= threshold)
    (inliers,) = np.where(scores < threshold)
    return inliers, outliers
Exemplo n.º 18
0
  def __init__(self, lab_coords_x, lab_coords_y, data, i_panel, delta_scalar, params, verbose=False):
    """Filter observations with a Minimum Covariance Determinant estimator.

    Each observation is the triple (lab_coords_x[i], lab_coords_y[i], data[i]).
    An empirical and a robust (MCD) covariance are fitted on a pre-screened
    training subset; every observation is then scored with the robust
    estimator and kept or rejected according to
    params.residuals.mcd_filter.mahalanobis_distance and
    params.residuals.mcd_filter.keep.

    Stores the selected lab_coords_x / lab_coords_y / data, the input and
    output counts (n_input, n_output) and both estimators (emp_cov, rob_cov)
    on the instance.  i_panel is only used in the verbose report.
    """
    training_data = []

    mean_x = flex.mean(lab_coords_x)
    mean_y = flex.mean(lab_coords_y)
    limit=delta_scalar * 10

    # Pre-screen: keep only points within `limit` of the mean in x and y and
    # with |data| <= 1 for fitting the estimators.
    for ix in range(len(data)):
      if abs(lab_coords_x[ix] - mean_x) > limit: continue
      if abs(lab_coords_y[ix] - mean_y) > limit: continue
      if abs(data[ix])>1: continue
      training_data.append((lab_coords_x[ix],lab_coords_y[ix],data[ix]))
    if verbose: print("Training data is less",len(lab_coords_x) - len(training_data),end=" ")
    # Every observation (no pre-screening); this is the set that gets scored.
    colorcode_set = []
    for ix in range(len(data)):
      colorcode_set.append((lab_coords_x[ix],lab_coords_y[ix],data[ix]))

    from sklearn.covariance import EmpiricalCovariance, MinCovDet
    # empirical (maximum-likelihood) covariance of the training subset
    emp_cov = EmpiricalCovariance(assume_centered=False, store_precision=True).fit(X=training_data)
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet(assume_centered=False, store_precision=True).fit(X=training_data)

    features = ["Δx","Δy","ΔΨ(deg)"]
    if verbose:
      print("%3d"%i_panel,end=" ")
      print("%4d items "%(len(training_data),),end=" ")
    # Report location +/- standard deviation per feature, empirical first.
    for idx_report in range(len(features)):
      feature = features[idx_report]
      diag_elem = math.sqrt(emp_cov.covariance_[idx_report,idx_report])
      if verbose: print( "%s=%7.2f±%6.2f"%(feature, emp_cov.location_[idx_report], diag_elem),end=" ")

    # Same report for the robust estimator, preceded by its support size.
    if verbose: print("%4d items:"%(flex.bool(robust_cov.support_).count(True)),end=" ")
    for idx_report in range(len(features)):
      feature = features[idx_report]
      diag_elem = math.sqrt(robust_cov.covariance_[idx_report,idx_report])
      if verbose: print( "%s=%7.2f±%6.2f"%(feature, robust_cov.location_[idx_report], diag_elem),end=" ")

    disc = flex.double(robust_cov.mahalanobis(X=colorcode_set)) # this metric represents Mahalanobis distance ** 2
    # Compare against the squared threshold because `disc` is squared.
    disc_select = disc < (params.residuals.mcd_filter.mahalanobis_distance)**2
    if params.residuals.mcd_filter.keep == "outliers":
      # Invert the selection to keep the rejected points instead.
      disc_select = (disc_select==False)
    # NOTE(review): the numerator counts over all observations while the
    # denominator is the training-subset size -- confirm this is intended.
    if verbose: print("OK %4.1f%%"%(100*(disc_select.count(True))/len(training_data)))
    self.lab_coords_x = lab_coords_x.select(disc_select)
    self.lab_coords_y = lab_coords_y.select(disc_select)
    self.data = data.select(disc_select)
    self.n_input = len(lab_coords_x)
    self.n_output = len(self.lab_coords_x)
    self.emp_cov = emp_cov
    self.rob_cov = robust_cov
def mahalanobis_plot(ctry=None, df=None, weighted=True, inliers=False):
    """
    Scatter a two-column data set with empirical and robust distance contours.

    See http://scikit-learn.org/0.13/modules/outlier_detection.html#\
        fitting-an-elliptic-envelop

    for details.

    :param ctry: country key; used to load the data when ``df`` is None and
        to look up the plot title in ``country_code``
    :param df: data frame to plot; loaded with ``load_res`` when omitted.
        The first two columns are plotted and the 2-D contour grid is
        evaluated with estimators fitted on all columns, so two columns are
        expected.
    :param weighted: forwarded to ``load_res``
    :param inliers: when true, ``df`` is first reduced with ``get_inliers``
    :return: tuple ``(fig, ax1, ctry)``
    :raises ValueError: if neither ``ctry`` nor ``df`` is given
    """
    if df is None and ctry is None:
        raise ValueError('Either the country or a dataframe must be supplied')
    elif df is None:
        df = load_res(ctry, weighted=weighted)
    if inliers:
        df = get_inliers(df=df)
    X = df.values

    # robust (MCD) and maximum-likelihood covariance of the same data
    robust_cov = MinCovDet().fit(X)
    emp_cov = EmpiricalCovariance().fit(X)

    # Display results
    fig = plt.figure()
    fig.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

    # Show data set
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(X[:, 0], X[:, 1], alpha=.5, color='k', marker='.')
    # A data frame may be supplied without a country; only look up the title
    # when a country key is actually available.
    if ctry is not None:
        ax1.set_title(country_code[ctry])

    # Show contours of the distance functions
    x_lo, x_hi = ax1.get_xlim()
    y_lo, y_hi = ax1.get_ylim()
    xx, yy = np.meshgrid(np.linspace(x_lo, x_hi, 100),
                         np.linspace(y_lo, y_hi, 100))
    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
    emp_cov_contour = ax1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
    robust_contour = ax1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')

    # legend_elements() provides per-level legend handles; direct access to
    # ContourSet.collections is no longer available in recent matplotlib.
    ax1.legend([emp_cov_contour.legend_elements()[0][1],
                robust_contour.legend_elements()[0][1]],
               ['MLE dist', 'robust dist'],
               loc="upper right", borderaxespad=0)
    ax1.grid()
    return (fig, ax1, ctry)
Exemplo n.º 20
0
def calcurate_mahalabinos_distance(TCKname):
    """Return robust Mahalanobis values for the statistics files of a tract.

    Three tab-delimited files are read: ``<TCKname>-stat.txt``,
    ``<TCKname>-tdi.txt`` (its values are inverted with ``np.reciprocal``)
    and ``<TCKname>-cur.txt``.  They are stacked as the three columns of one
    matrix, an MCD estimator is fitted on it and its ``mahalanobis`` output
    for every row is returned.

    :param TCKname: common file-name prefix of the three statistics files
    :return: 1-D array with one value per row
    """
    length = np.loadtxt(TCKname + "-stat.txt", delimiter='\t')
    tdi = np.reciprocal(np.loadtxt(TCKname + "-tdi.txt", delimiter='\t'))
    cur = np.loadtxt(TCKname + "-cur.txt", delimiter='\t')

    n_samples = length.shape[0]
    X = np.hstack((np.reshape(length, (n_samples, 1)),
                   np.reshape(tdi, (n_samples, 1)),
                   np.reshape(cur, (n_samples, 1))))

    robust_cov = MinCovDet().fit(X)
    # mahalanobis() measures from the fitted location_ itself; subtracting
    # location_ beforehand (as the original did) centres the data twice.
    return robust_cov.mahalanobis(X)
Exemplo n.º 21
0
def ComputeMahalanobisDistance(data):
    """Compute MahalanobisDistance and return as DataFrame

    An MCD estimator is fitted on ``data``; the square root of its
    ``mahalanobis`` output is stored in a single column named ``distance``.

    Parameters:
    data (DataFrame): Pandas DataFrame

    Returns:
    DataFrame: contains mahalanobis distances with indices from data
    """
    rob_cov = MinCovDet().fit(data)
    distances = np.sqrt(rob_cov.mahalanobis(data))

    # Column labels must be an ordered collection: pandas rejects a set here.
    return pd.DataFrame(data=distances,
                        columns=['distance'],
                        index=data.index.values)
Exemplo n.º 22
0
    def pre_screen(self, var, disp, thresh=10):
        """Uses Minimum Covariance Determinant / Mahalanobis distance ideas to detect outliers, loosely based on :cite:`chawla_k-means:_2013`.

        Every column of ``var`` is summarised by its mean and standard
        deviation; an MCD estimator is fitted on these pairs and columns
        whose ``mahalanobis`` value exceeds ``thresh`` are outlier
        candidates.  A candidate is removed when it is listed in ``disp`` or
        when it is the first or last column.

        Sets ``self.dispensable`` to the entries of ``disp`` that were not
        removed, prints the 'file' level of every removed column and returns
        ``var`` without those columns.

        :param var: DataFrame whose columns have a level named 'file'
        :param disp: positional indices of columns that may be dropped
        :param thresh: cut-off on the ``mahalanobis`` value
        """
        fx = var.columns.names.index('file')
        # `axis` must be passed by keyword: pandas 2 made it keyword-only.
        feat = pd.concat((var.mean(), var.std()), axis=1)
        mcd = MinCovDet().fit(feat)
        md = mcd.mahalanobis(feat)
        s = set(np.where(md > thresh)[0])
        # The last column's position is shape[1] - 1; shape[1] itself can
        # never occur in `s`, so the original never matched the last column.
        end_columns = {0, var.shape[1] - 1}
        k = s.intersection(disp).union(s.intersection(end_columns))
        self.dispensable = list(set(disp) - k)
        if len(k) > 0:
            print(
                '\n\nThe following files have been removed from the concatenation as unnecessary outliers:\n'
            )
        for i in k:
            print(var.columns[i][fx])
        return var.drop(var.columns[list(k)], axis=1)
def MahalanobisOutliers(featMatProjected, extremeness=2., showplot=True):
    """ Return the indices of outlier rows using the robust Mahalanobis distance.

        An MCD estimator is fitted on the leading principal components (at
        most 10, fewer if the input has fewer columns).  When ``showplot`` is
        true, PC1 vs PC2 is scattered and coloured by distance (this needs
        at least two columns).

        Outlier threshold = mean(distance) + std(distance) * extremeness
        [extreme_values=2, very_extreme_values=3 --> according to 68-95-99.7 rule]

        Returns an integer array of row indices whose distance is >= the
        threshold.
    """
    # NB: Euclidean distance puts more weight than it should on correlated variables
    # Chicken and egg situation, we can't know they are outliers until we calculate
    # the stats of the distribution, but the stats of the distribution are skewed by outliers!
    # Mahalanobis gets around this by weighting by robust estimation of covariance matrix

    # Use the first 10 principal components, or all of them if fewer exist.
    n_pcs = min(10, featMatProjected.shape[1])
    leading_pcs = featMatProjected[:, :n_pcs]

    # Fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(leading_pcs)

    # Get the Mahalanobis distance
    MahalanobisDist = robust_cov.mahalanobis(leading_pcs)

    # Colour PCA by Mahalanobis distance
    if showplot:
        plt.close('all')
        plt.rc('xtick', labelsize=15)
        plt.rc('ytick', labelsize=15)
        fig, ax = plt.subplots(figsize=[10, 10])
        ax.set_facecolor('white')
        # Plot straight from the score matrix; the original referred to a
        # `projectedTable` that is not defined in this function.
        plt.scatter(leading_pcs[:, 0],
                    leading_pcs[:, 1],
                    c=MahalanobisDist)
        plt.title('Mahalanobis Distance for Outlier Detection', fontsize=20)
        plt.colorbar()

    upper_t = np.mean(MahalanobisDist) + np.std(MahalanobisDist) * extremeness
    # np.where yields an integer index array even when nothing is flagged.
    outliers = np.where(MahalanobisDist >= upper_t)[0]
    print("Outliers found: %d" % len(outliers))

    return outliers
Exemplo n.º 24
0
def MahalanobisOutliers(featMatProjected, extremeness=2., showplot=False):
    """ Return the indices of outlier rows using the robust Mahalanobis distance.

        An MCD estimator is fitted on the leading principal components (at
        most 10, fewer if the input has fewer columns).  When ``showplot`` is
        true, PC1 vs PC2 is scattered and coloured by distance (this needs
        at least two columns).

        Outlier threshold = mean(distance) + std(distance) * extremeness
        [extreme_values=2, very_extreme_values=3 --> according to 68-95-99.7 rule]

        Returns an integer array of row indices whose distance is >= the
        threshold.
    """
    # NB: Euclidean distance puts more weight than it should on correlated variables
    # Chicken and egg situation, we can't know they are outliers until we calculate
    # the stats of the distribution, but the stats of the distribution are skewed by outliers!
    # Mahalanobis gets around this by weighting by robust estimation of covariance matrix

    # Use the first 10 principal components, or all of them if fewer exist,
    # so the column labels below always match the data.
    n_pcs = min(10, featMatProjected.shape[1])
    leading_pcs = featMatProjected[:, :n_pcs]

    # Fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(leading_pcs)

    # Get the Mahalanobis distance
    MahalanobisDist = robust_cov.mahalanobis(leading_pcs)

    # Colour PCA by Mahalanobis distance
    if showplot:
        projectedTable = pd.DataFrame(
            leading_pcs,
            columns=['PC' + str(n + 1) for n in range(n_pcs)])
        plt.close('all')
        plt.rc('xtick', labelsize=15)
        plt.rc('ytick', labelsize=15)
        fig, ax = plt.subplots(figsize=[10, 10])
        ax.set_facecolor('#F7FFFF')
        plt.scatter(np.array(projectedTable['PC1']),
                    np.array(projectedTable['PC2']),
                    c=MahalanobisDist)
        plt.title('Mahalanobis Distance for Outlier Detection', fontsize=20)
        plt.colorbar()

    upper_t = np.mean(MahalanobisDist) + np.std(MahalanobisDist) * extremeness
    # np.where yields an integer index array even when nothing is flagged
    # (np.array([]) built from an empty list would be a float array).
    outliers = np.where(MahalanobisDist >= upper_t)[0]
    print("Outliers found: %d" % len(outliers))

    return outliers
Exemplo n.º 25
0
def outlier_measure(X, method="robust_covar"):
    """
    Score every sample of ``X`` for outlyingness with the chosen method.

    :param X: data matrix, rows are samples
    :param method: one of ``"robust_covar"`` (square root of the MCD
        ``mahalanobis`` value, fixed offset 3), ``"isolation_forest"`` or
        ``"local_outlier_detection"`` (negated estimator score and offset)
    :return: tuple ``(measure, offset, assignment)`` where ``assignment`` is
        1 for samples with ``measure < offset`` and 0 otherwise
    :raises ValueError: if ``method`` is not one of the supported names
    """
    if method == "robust_covar":
        robust_cov = MinCovDet().fit(X)
        measure = np.sqrt(robust_cov.mahalanobis(X))
        offset = 3
    elif method == "isolation_forest":
        # NOTE(review): the `behaviour` argument only exists in some
        # scikit-learn releases -- confirm against the installed version.
        clf = IsolationForest(behaviour="new", contamination="auto")
        clf.fit(X)
        # Negate so that larger values mean "more outlying" for all methods.
        measure = -clf.score_samples(X)
        offset = -clf.offset_
    elif method == "local_outlier_detection":
        clf = LOF(contamination="auto")
        clf.fit(X)
        measure = -clf.negative_outlier_factor_
        offset = -clf.offset_
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError("unknown outlier method: {!r}".format(method))
    assignment = np.where(measure < offset, 1, 0)
    return measure, offset, assignment
Exemplo n.º 26
0
def estimateGaussian(nb_objects_init, nb_objects_final, thr, who, genes, siRNA,
                     loadingFolder = '../resultData/thrivisions/predictions',
                     threshold=0.05,):
    """Rank experiments by signed robust distance to the control population.

    NOTE: this is Python 2 code (print statements, list-returning ``filter``).

    The three measurement vectors are stacked as columns ``(thr,
    nb_objects_init, nb_objects_final)``. Rows whose siRNA occurs exactly
    once are dropped from the matrix and from ``who``, ``genes`` and
    ``siRNA``. A ``MinCovDet`` estimator is fitted on the rows whose gene is
    ``'ctrl'`` and every remaining row is scored with it.

    Parameters:
        nb_objects_init, nb_objects_final, thr: equally long 1-D sequences,
            one entry per experiment.
        who, genes, siRNA: per-experiment labels, row-aligned with the
            measurements (presumably experiment id, gene name and siRNA id;
            confirm against the caller).
        loadingFolder: forwarded to ``empiricalPvalues`` as ``folder``.
        threshold: q-value cut-off below which an experiment counts as a hit.

    Returns:
        robdist: Mahalanobis value of every kept row, multiplied by the sign
            of that row's ``thr`` deviation from the mean ``thr``.
        pval, qval: whatever ``empiricalPvalues`` returns for the
            non-control rows with positive ``robdist``.
        hits: siRNAs for which at least half of their experiments have
            ``qval < threshold``.
        gene_hits: ``Counter`` of the genes associated with ``hits``.
    """
    
    arr=np.vstack((thr, nb_objects_init, nb_objects_final)).T    
    #deleting siRNAs that have only one experiment
    print len(siRNA)
    all_=Counter(siRNA);siRNA = np.array(siRNA)
    toDelsi=filter(lambda x: all_[x]==1, all_)
    toDelInd=[]
    for si in toDelsi:
        toDelInd.extend(np.where(siRNA==si)[0])
    print len(toDelInd)
    # Delete the same row indices from all four row-aligned arrays.
    dd=dict(zip(range(4), [arr, who, genes, siRNA]))
    for array_ in dd:
        dd[array_]=np.delete(dd[array_],toDelInd,0 )
    arr, who, genes, siRNA = [dd[el] for el in range(4)]
    
    print arr.shape
    
    # Robust location/covariance estimated from the control rows only.
    arr_ctrl=arr[np.where(np.array(genes)=='ctrl')]
    ctrlcov=MinCovDet().fit(arr_ctrl)
    
    # Column 0 is ``thr``: the distance is made negative for rows whose
    # ``thr`` lies below the overall mean.
    robdist= ctrlcov.mahalanobis(arr)*np.sign(arr[:,0]-np.mean(arr[:,0]))
    new_siRNA=np.array(siRNA)[np.where((genes!='ctrl')&(robdist>0))]
    # Controls enter with absolute distances; only positive non-control
    # distances are tested.  ``empiricalPvalues`` is defined elsewhere.
    pval,qval =empiricalPvalues(np.absolute(robdist[np.where(genes=='ctrl')])[:, np.newaxis],\
                           robdist[np.where((genes!='ctrl')&(robdist>0))][:, np.newaxis],\
                           folder=loadingFolder, name="thrivision", sup=True, also_pval=True)
    assert new_siRNA.shape==qval.shape
    # Number of significant experiments per siRNA.
    hits=Counter(new_siRNA[np.where(qval<threshold)[0]])
    
    # Keep siRNAs significant in at least half of their experiments.
    # NOTE(review): relies on Python 2's eager ``filter``; the lambda reads
    # ``hits`` while the same name is being rebound.
    hits=filter(lambda x: float(hits[x])/all_[x]>=0.5, hits)
    # Gene of the first row carrying each hit siRNA.
    gene_hits = [genes[list(siRNA).index(el)] for el in hits]
    gene_hits=Counter(gene_hits)
    
    return robdist, pval,qval, hits, gene_hits
                               color='red', label='outliers')
# Keep the current left x-limit and force the right limit to 11.
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions
# 100 x 100 evaluation grid spanning the current axis limits.
xx, yy = np.meshgrid(np.linspace(pl.xlim()[0], pl.xlim()[1], 100),
                     np.linspace(pl.ylim()[0], pl.ylim()[1], 100))
# One (x, y) row per grid node.
zz = np.c_[xx.ravel(), yy.ravel()]

# Evaluate the non-robust estimator on the grid; the square root of the
# returned values is what gets contoured.
mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=pl.cm.PuBu_r,
                                  linestyles='dashed')

# Same for the robust estimator.
mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=pl.cm.YlOrBr_r, linestyles='dotted')

# The second contour line of each set serves as its legend handle.
subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1],
                inlier_plot, outlier_plot],
               ['MLE dist', 'robust dist', 'inliers', 'outliers'],
               loc="upper right", borderaxespad=0)
# Hide tick marks on both axes.
pl.xticks(())
pl.yticks(())

# Plot the scores for each point
# NOTE(review): X is mean-centred before the call; confirm that this
# estimator's mahalanobis() does not centre the data again internally.
# The exponent 0.33 approximates a cube root.
emp_mahal = emp_cov.mahalanobis(X - np.mean(X, 0)) ** (0.33)
subfig2 = pl.subplot(2, 2, 3)
# First box: all rows except the last n_outliers; second box: the last
# n_outliers rows.
subfig2.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=.25)
Exemplo n.º 28
0
                               color='red', label='outliers')
# Keep the current left x-limit and force the right limit to 11.
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions
# 100 x 100 evaluation grid spanning the current axis limits.
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
                     np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
# One (x, y) row per grid node.
zz = np.c_[xx.ravel(), yy.ravel()]

# Evaluate the non-robust estimator on the grid; the square root of the
# returned values is what gets contoured.
mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')

# Same for the robust estimator.
mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')

# The second contour line of each set serves as its legend handle.
subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1],
                inlier_plot, outlier_plot],
               ['MLE dist', 'robust dist', 'inliers', 'outliers'],
               loc="upper right", borderaxespad=0)
# Hide tick marks on both axes.
plt.xticks(())
plt.yticks(())

# Plot the scores for each point
# (per-point box plot is disabled in this variant of the script)
# emp_mahal = emp_cov.mahalanobis(X - np.mean(X, 0)) ** (0.33)
# subfig2 = plt.subplot(2, 2, 3)
# subfig2.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=.25)
Exemplo n.º 29
0
def _rescale01(values):
    """Shift and scale ``values`` so the non-NaN minimum is 0 and maximum is 1."""
    shifted = values - np.nanmin(values)
    return shifted / np.nanmax(shifted)


def _plot_scores(points, plotdims, scatter_colors, heights, surface_colors,
                 zlabel, title):
    """Open a new figure showing per-point scores as a 2-D scatter or 3-D surface.

    ``scatter_colors`` colours the 2-D scatter; ``heights`` and
    ``surface_colors`` are the z-values and ``color`` argument of the 3-D
    triangulated surface.
    """
    fig = plt.figure()
    if plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(points[:, 0],
                   points[:, 1],
                   cmap=plt.cm.jet,
                   c=scatter_colors,
                   s=60,
                   linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(points[:, 0],
                        points[:, 1],
                        heights,
                        cmap=plt.cm.jet,
                        color=surface_colors)
        ax.set_zlabel(zlabel)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title(title)


def main():
    """Compare Mahalanobis distances with autoencoder reconstruction error.

    Command-line entry point.  Loads a CSV dataset (reduced to two PCA
    components when it has more than two columns), saves it as
    ``outliers.npy`` next to this script for ``train_autoencoder``, plots the
    robust (``MinCovDet``) and empirical Mahalanobis values over either the
    training points or a 50 x 50 grid, then trains one autoencoder per
    encoder/decoder activation pair, plots its reconstruction error and
    prints Pearson correlations between the scores.

    The temporary ``outliers.npy`` and ``outliers.pkl`` files are removed
    even when a step fails.
    """
    parser = argparse.ArgumentParser(
        description='Plot outlier-like distances for a 2-dimensional dataset')
    parser.add_argument('dataset',
                        type=argparse.FileType('r'),
                        help='a CSV file containing the dataset')
    parser.add_argument(
        '--plot',
        type=str,
        choices=['train', 'grid'],
        default='grid',
        help='plot the dataset or a grid evenly distributed over its span')
    parser.add_argument('--plotdims',
                        type=int,
                        choices=[2, 3],
                        default=2,
                        help='the number of dimensions to plot')

    args = parser.parse_args()

    # argparse opened the file already; close it as soon as it is parsed.
    with args.dataset as dataset_file:
        X = np.loadtxt(dataset_file, delimiter=',')
    plt.figure()

    if X.shape[1] > 2:
        X = PCA(n_components=2).fit_transform(X)

    if args.plotdims == 2:
        plt.scatter(X[:, 0], X[:, 1], s=60, linewidth='0')
    else:
        plt.scatter(X[:, 0], X[:, 1])
    plt.show(block=False)

    dir_of_script = os.path.dirname(os.path.realpath(__file__))
    dataset_path = os.path.join(dir_of_script, 'outliers.npy')

    try:
        np.save(dataset_path, X)

        #######################################################################
        # Train autoencoder with the n samples until convergence.  Run
        # evenly distributed samples through the autoencoder and compute
        # their reconstruction error.
        #######################################################################

        # Extend the overall value range by half its width on either side.
        maxseq_orig = np.max(X)
        minseq_orig = np.min(X)
        seqrange = np.abs(maxseq_orig - minseq_orig)
        maxseq = maxseq_orig + 0.5 * seqrange
        minseq = minseq_orig - 0.5 * seqrange
        print("minseq", minseq, "maxseq", maxseq)
        if args.plot == 'grid':
            seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
            Xplot = np.array(list(product(seq, seq)))
        else:
            Xplot = X

        robust_md = MinCovDet().fit(X).mahalanobis(Xplot)
        empirical_md = EmpiricalCovariance().fit(X).mahalanobis(Xplot)

        # Assume Xplot is at least 2-dimensional.
        if Xplot.shape[1] > 2:
            Xplot2d = bh_sne(Xplot)
        else:
            Xplot2d = Xplot

        robust_md01 = _rescale01(robust_md)
        empirical_md01 = _rescale01(empirical_md)

        _plot_scores(Xplot2d, args.plotdims, robust_md01, robust_md01,
                     robust_md01, 'Mahalanobis distance',
                     'Mahalanobis distance (robust covariance)')
        _plot_scores(Xplot2d, args.plotdims, empirical_md01, empirical_md01,
                     empirical_md01, 'Mahalanobis distance',
                     'Mahalanobis distance (empirical covariance)')

        enc_dec = [
            # tanh encoder, linear decoder
            ['tanh', 'linear'],
            # sigmoid encoder, linear decoder
            ['sigmoid', 'linear'],
            ###################################################################
            # The reconstruction error of the autoencoders trained with the
            # remaining commented-out pairs don't seem to match Mahalanobis
            # distance very well.  Feel free to uncomment them to see for
            # yourself.
            # linear encoder, linear decoder
            # ['linear', 'linear'],
            # tanh encoder, tanh decoder
            # ['tanh', 'tanh'],
            # tanh encoder, sigmoid decoder
            # ['tanh', 'sigmoid'],
            # sigmoid encoder, tanh decoder
            # ['sigmoid', 'tanh'],
            # sigmoid encoder, sigmoid decoder
            # ['sigmoid', 'sigmoid']
            ###################################################################
        ]

        for act in enc_dec:
            enc, dec = act
            # A linear decoder is requested from train_autoencoder as None.
            if dec == 'linear':
                dec = None
            model = train_autoencoder(dataset_path,
                                      act_enc=enc,
                                      act_dec=dec,
                                      nvis=X.shape[1],
                                      nhid=16)

            Xshared = theano.shared(
                np.asarray(Xplot, dtype=theano.config.floatX), borrow=True)
            f = theano.function([], outputs=model.reconstruct(Xshared))
            fit = f()
            error = reconstruction_error(Xplot, fit)
            error01 = _rescale01(error)

            encdec_type = ', '.join(act)
            # The scatter and the surface height use the raw error; only the
            # surface colour uses the rescaled one.
            _plot_scores(Xplot2d, args.plotdims, error, error, error01,
                         'Reconstruction error',
                         'Reconstruction error (' + encdec_type + ')')

            print("Correlation of robust MD and reconstruction error (" +
                  str(encdec_type) + ") " + str(pearsonr(robust_md, error)))
            print("Correlation of empirical MD and reconstruction error (" +
                  str(encdec_type) + ") " +
                  str(pearsonr(empirical_md, error)))

        print("Correlation of robust MD and empirical MD " +
              str(pearsonr(robust_md, empirical_md)))
    finally:
        # Remove the temporary files even if training or plotting failed.
        for leftover in (dataset_path, 'outliers.pkl'):
            if os.path.exists(leftover):
                os.remove(leftover)

    plt.show(block=True)
Exemplo n.º 30
0
# Show contours of the distance functions
# 100 x 100 evaluation grid spanning the current axis limits.
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0],
                                 plt.xlim()[1], 100),
                     np.linspace(plt.ylim()[0],
                                 plt.ylim()[1], 100))
# One (x, y) row per grid node.
zz = np.c_[xx.ravel(), yy.ravel()]

# Evaluate the non-robust estimator on the grid; the square root of the
# returned values is what gets contoured.
mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx,
                                  yy,
                                  np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')

# Same for the robust estimator.
mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx,
                                 yy,
                                 np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r,
                                 linestyles='dotted')

# The second contour line of each set serves as its legend handle.
subfig1.legend([
    emp_cov_contour.collections[1], robust_contour.collections[1], inlier_plot,
    outlier_plot
], ['MLE dist', 'robust dist', 'inliers', 'outliers'],
               loc="upper right",
               borderaxespad=0)
# Hide tick marks on both axes.
plt.xticks(())
plt.yticks(())
Exemplo n.º 31
0
class Grid:
    """Point set whose grid rotation and spacing are estimated robustly.

    ``analyze`` encodes the offset from every point to its closest
    neighbour, fits a ``MinCovDet`` estimator to those encodings, and uses
    the fitted location to derive the grid angle (``theta``) and spacing
    (``step_size``).  Inlier points are then rotated/scaled to unit spacing
    and snapped to integer coordinates.
    """

    def __init__(self, dim=10, noise=0.1, outliers=0):
        """Create the points with ``create_grid`` and allocate result buffers."""
        self.points = create_grid(dim, noise, outliers)
        n_points = len(self.points)
        self.polardata = np.zeros((n_points, 2))
        self.polar_cov = 0
        self.inlier_points = np.zeros((n_points, 2))
        self.inlier_indicies = np.zeros((n_points, 1))
        self.normalized_points = np.zeros((n_points, 2))
        self.theta = 0
        self.step_size = 1
        # Three rows per point: segment start, segment end, separator.
        self.linedata = np.zeros((3 * n_points, 2))
        self.normalized_point_ids = []
        self.bounds = [0, 0, 0, 0]

    def step(self, rotation=0):
        """Replace the points with ``rotate(self.points, rotation)``."""
        self.points = rotate(self.points, rotation)

    def analyze(self, mahalanobis_tolerance=2):
        """Estimate grid angle and spacing, then snap inliers to integer cells.

        Parameters
        ----------
        mahalanobis_tolerance : number
            A point is kept as an inlier when its Mahalanobis value raised
            to the power 0.33 is below this threshold.

        Updates ``linedata``, ``polardata``, ``polar_cov``, ``theta``,
        ``step_size``, ``inlier_points``, ``inlier_indicies``,
        ``normalized_points`` and ``bounds`` (``[min_x, min_y, max_x,
        max_y]`` of the snapped points).
        """
        self.inlier_points = np.zeros((len(self.points), 2))
        for id1 in range(len(self.points)):
            id2 = closest_point(self.points, self.points[id1], id1)[0]

            # keep lines for plotting purposes; the [None, None] row is
            # stored as NaN (presumably a break between segments)
            self.linedata[3 * id1] = self.points[id1]
            self.linedata[3 * id1 + 1] = self.points[id2]
            self.linedata[3 * id1 + 2] = [None, None]

            # we are repeating every pi/2, so we compress the angle space by 4x
            a = 4 * math.atan2(self.points[id1, 1] - self.points[id2, 1],
                               self.points[id1, 0] - self.points[id2, 0])
            r = np.linalg.norm(self.points[id1] - self.points[id2])
            self.polardata[id1] = [r * math.cos(a), r * math.sin(a)]

        # find the minimal covariance inlier cluster
        self.polar_cov = MinCovDet().fit(self.polardata)

        # extract the grid angle and size.  angle is divided by 4 because
        # we previously scaled it up to repeat every 90 deg
        self.theta = math.atan2(-self.polar_cov.location_[1],
                                self.polar_cov.location_[0]) / 4
        self.step_size = np.linalg.norm(self.polar_cov.location_)

        # extract inlier points
        polar_mahal = self.polar_cov.mahalanobis(self.polardata)**(0.33)
        inlier_count = 0
        for i, distance in enumerate(polar_mahal):
            if distance < mahalanobis_tolerance:  # tolerance to outliers
                self.inlier_points[inlier_count] = self.points[i]
                self.inlier_indicies[inlier_count] = i
                inlier_count += 1

        self.normalized_points = rotate(self.inlier_points[:inlier_count],
                                        -self.theta) / self.step_size

        # enumerate grid IDs
        # NOTE(review): np.mean without an axis is one scalar over all
        # coordinates; confirm closest_point is meant to receive a scalar
        # rather than the per-axis centroid.
        origin_id = closest_point(self.normalized_points,
                                  np.mean(self.normalized_points))[0]
        self.normalized_points = (self.normalized_points -
                                  self.normalized_points[origin_id])
        inlier_count = 0

        # sys.maxint only exists on Python 2; fall back to sys.maxsize so
        # the same code also runs on Python 3.
        largest = getattr(sys, 'maxint', sys.maxsize)
        self.bounds = [largest, largest, -largest, -largest]
        for p in self.normalized_points:
            x = round(p[0])
            y = round(p[1])
            d = np.linalg.norm(p - [x, y])
            if d < 0.4:  # tolerance from unit position
                # Compact in place: the write index never passes the row
                # currently being read.
                self.normalized_points[inlier_count] = [x, y]
                self.bounds[0] = min(self.bounds[0], x)
                self.bounds[2] = max(self.bounds[2], x)
                self.bounds[1] = min(self.bounds[1], y)
                self.bounds[3] = max(self.bounds[3], y)
                inlier_count += 1

        self.normalized_points = self.normalized_points[:inlier_count]
Exemplo n.º 32
0
subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions
# Evaluation grid over the current axis limits: 800 samples along x,
# 100 along y.
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 800),
                     np.linspace(plt.ylim()[0], plt.ylim()[1], 100))

# One (x, y) row per grid node.
zz = np.c_[xx.ravel(), yy.ravel()]

# Evaluate the non-robust estimator on the grid; the square root of the
# returned values is what gets contoured.
mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')

# Same for the robust estimator.
mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')

# The second contour line of each set serves as its legend handle; the
# scattered points are labelled 'kpis'.
subfig1.legend([emp_cov_contour.collections[1], robust_contour.collections[1],
                inlier_plot],
               ['MLE dist', 'robust dist', 'kpis'],
               loc="upper right", borderaxespad=0)
print(np.corrcoef(first_half,second_half))
#%%

full_data = full_data.drop(['Year', 'Month', 'Day', 'Data Quality','Max Temp (°C)', 
                            'Max Temp Flag', 'Min Temp (°C)', 'Min Temp Flag',
                            'Mean Temp Flag',
                            'Heat Deg Days Flag', 
# The last n_outliers rows of X are drawn in red as the outliers.
subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                color='red', label='outliers')
# Keep the current left x-limit and force the right limit to 11.
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
subfig1.legend(loc="upper right")

# Non-robust scores; the exponent 0.33 approximates the cube root named in
# the y-axis label.
emp_mahal = emp_cov.mahalanobis(X) ** (0.33)
subfig2 = pl.subplot(2, 2, 3)
# First box: all rows except the last n_outliers; second box: the last
# n_outliers rows.
subfig2.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=.25)
# Overlay the individual scores as '+' markers just right of each box.
subfig2.plot(1.26 * np.ones(n_samples - n_outliers),
             emp_mahal[:-n_outliers], '+k', markeredgewidth=1)
subfig2.plot(2.26 * np.ones(n_outliers),
             emp_mahal[-n_outliers:], '+k', markeredgewidth=1)
subfig2.axes.set_xticklabels(('inliers', 'outliers'), size=11)
subfig2.set_ylabel(r"$\sqrt[3]{\rm{(Mahal. dist.)}}$")
subfig2.set_title("1. from non-robust estimates\n(Maximum Likelihood)")

# Same layout for the scores from the robust estimator.
robust_mahal = robust_cov.mahalanobis(X) ** (0.33)
subfig3 = pl.subplot(2, 2, 4)
subfig3.boxplot([robust_mahal[:-n_outliers], robust_mahal[-n_outliers:]],
                widths=.25)
subfig3.plot(1.26 * np.ones(n_samples - n_outliers),
             robust_mahal[:-n_outliers], '+k', markeredgewidth=1)
subfig3.plot(2.26 * np.ones(n_outliers),
             robust_mahal[-n_outliers:], '+k', markeredgewidth=1)
subfig3.axes.set_xticklabels(('inliers', 'outliers'), size=11)
subfig3.set_ylabel(r"$\sqrt[3]{\rm{(Mahal. dist.)}}$")
subfig3.set_title("2. from robust estimates\n(Minimum Covariance Determinant)")

pl.show()
Exemplo n.º 34
0
def _rescale01(values):
    """Shift and scale ``values`` so the non-NaN minimum is 0 and maximum is 1."""
    shifted = values - np.nanmin(values)
    return shifted / np.nanmax(shifted)


def _plot_scores(points, plotdims, scatter_colors, heights, surface_colors,
                 zlabel, title):
    """Open a new figure showing per-point scores as a 2-D scatter or 3-D surface.

    ``scatter_colors`` colours the 2-D scatter; ``heights`` and
    ``surface_colors`` are the z-values and ``color`` argument of the 3-D
    triangulated surface.
    """
    fig = plt.figure()
    if plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(points[:, 0], points[:, 1],
            cmap=plt.cm.jet, c=scatter_colors, s=60, linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(points[:, 0], points[:, 1], heights,
            cmap=plt.cm.jet, color=surface_colors)
        ax.set_zlabel(zlabel)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title(title)


def main():
    """Compare Mahalanobis distances with autoencoder reconstruction error.

    Command-line entry point.  Loads a CSV dataset (reduced to two PCA
    components when it has more than two columns), saves it as
    ``outliers.npy`` next to this script for ``train_autoencoder``, plots the
    robust (``MinCovDet``) and empirical Mahalanobis values over either the
    training points or a 50 x 50 grid, then trains one autoencoder per
    encoder/decoder activation pair, plots its reconstruction error and
    prints Pearson correlations between the scores.

    The temporary ``outliers.npy`` and ``outliers.pkl`` files are removed
    even when a step fails.
    """
    parser = argparse.ArgumentParser(
        description='Plot outlier-like distances for a 2-dimensional dataset')
    parser.add_argument(
        'dataset', type=argparse.FileType('r'),
        help='a CSV file containing the dataset')
    parser.add_argument(
        '--plot', type=str, choices=['train', 'grid'], default='grid',
        help='plot the dataset or a grid evenly distributed over its span')
    parser.add_argument(
        '--plotdims', type=int, choices=[2, 3], default=2,
        help='the number of dimensions to plot')

    args = parser.parse_args()

    # argparse opened the file already; close it as soon as it is parsed.
    with args.dataset as dataset_file:
        X = np.loadtxt(dataset_file, delimiter=',')
    plt.figure()

    if X.shape[1] > 2:
        X = PCA(n_components=2).fit_transform(X)

    if args.plotdims == 2:
        plt.scatter(X[:, 0], X[:, 1], s=60, linewidth='0')
    else:
        plt.scatter(X[:, 0], X[:, 1])
    plt.show(block=False)

    dir_of_script = os.path.dirname(os.path.realpath(__file__))
    dataset_path = os.path.join(dir_of_script, 'outliers.npy')

    try:
        np.save(dataset_path, X)

        #######################################################################
        # Train autoencoder with the n samples until convergence.  Run
        # evenly distributed samples through the autoencoder and compute
        # their reconstruction error.
        #######################################################################

        # Extend the overall value range by half its width on either side.
        maxseq_orig = np.max(X)
        minseq_orig = np.min(X)
        seqrange = np.abs(maxseq_orig - minseq_orig)
        maxseq = maxseq_orig + 0.5 * seqrange
        minseq = minseq_orig - 0.5 * seqrange
        print("minseq", minseq, "maxseq", maxseq)
        if args.plot == 'grid':
            seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
            Xplot = np.array(list(product(seq, seq)))
        else:
            Xplot = X

        robust_md = MinCovDet().fit(X).mahalanobis(Xplot)
        empirical_md = EmpiricalCovariance().fit(X).mahalanobis(Xplot)

        # Assume Xplot is at least 2-dimensional.
        if Xplot.shape[1] > 2:
            Xplot2d = bh_sne(Xplot)
        else:
            Xplot2d = Xplot

        robust_md01 = _rescale01(robust_md)
        empirical_md01 = _rescale01(empirical_md)

        _plot_scores(Xplot2d, args.plotdims, robust_md01, robust_md01,
            robust_md01, 'Mahalanobis distance',
            'Mahalanobis distance (robust covariance)')
        _plot_scores(Xplot2d, args.plotdims, empirical_md01, empirical_md01,
            empirical_md01, 'Mahalanobis distance',
            'Mahalanobis distance (empirical covariance)')

        enc_dec = [
            # tanh encoder, linear decoder
            ['tanh', 'linear'],
            # sigmoid encoder, linear decoder
            ['sigmoid', 'linear'],
            ###################################################################
            # The reconstruction error of the autoencoders trained with the
            # remaining commented-out pairs don't seem to match Mahalanobis
            # distance very well.  Feel free to uncomment them to see for
            # yourself.
            # linear encoder, linear decoder
            # ['linear', 'linear'],
            # tanh encoder, tanh decoder
            # ['tanh', 'tanh'],
            # tanh encoder, sigmoid decoder
            # ['tanh', 'sigmoid'],
            # sigmoid encoder, tanh decoder
            # ['sigmoid', 'tanh'],
            # sigmoid encoder, sigmoid decoder
            # ['sigmoid', 'sigmoid']
            ###################################################################
        ]

        for act in enc_dec:
            enc, dec = act
            # A linear decoder is requested from train_autoencoder as None.
            if dec == 'linear':
                dec = None
            model = train_autoencoder(dataset_path,
                act_enc=enc, act_dec=dec, nvis=X.shape[1], nhid=16)

            Xshared = theano.shared(
                np.asarray(Xplot, dtype=theano.config.floatX), borrow=True)
            f = theano.function([], outputs=model.reconstruct(Xshared))
            fit = f()
            error = reconstruction_error(Xplot, fit)
            error01 = _rescale01(error)

            encdec_type = ', '.join(act)
            # The scatter and the surface height use the raw error; only the
            # surface colour uses the rescaled one.
            _plot_scores(Xplot2d, args.plotdims, error, error, error01,
                'Reconstruction error',
                'Reconstruction error (' + encdec_type + ')')

            print("Correlation of robust MD and reconstruction error (" +
                str(encdec_type) + ") " + str(pearsonr(robust_md, error)))
            print("Correlation of empirical MD and reconstruction error (" +
                str(encdec_type) + ") " + str(pearsonr(empirical_md, error)))

        print("Correlation of robust MD and empirical MD " +
            str(pearsonr(robust_md, empirical_md)))
    finally:
        # Remove the temporary files even if training or plotting failed.
        for leftover in (dataset_path, 'outliers.pkl'):
            if os.path.exists(leftover):
                os.remove(leftover)

    plt.show(block=True)
Exemplo n.º 35
0
class Grid:
    """Point set whose grid rotation and spacing are estimated robustly.

    ``analyze`` encodes the offset from every point to its closest
    neighbour, fits a ``MinCovDet`` estimator to those encodings, and uses
    the fitted location to derive the grid angle (``theta``) and spacing
    (``step_size``).  Inlier points are then rotated/scaled to unit spacing
    and snapped to integer coordinates.
    """

    def __init__(self, dim=10, noise=0.1, outliers=0):
        """Create the points with ``create_grid`` and allocate result buffers."""
        self.points = create_grid(dim, noise, outliers)
        n_points = len(self.points)
        self.polardata = np.zeros((n_points, 2))
        self.polar_cov = 0
        self.inlier_points = np.zeros((n_points, 2))
        self.inlier_indicies = np.zeros((n_points, 1))
        self.normalized_points = np.zeros((n_points, 2))
        self.theta = 0
        self.step_size = 1
        # Three rows per point: segment start, segment end, separator.
        self.linedata = np.zeros((3*n_points, 2))
        self.normalized_point_ids = []
        self.bounds = [0, 0, 0, 0]

    def step(self, rotation=0):
        """Replace the points with ``rotate(self.points, rotation)``."""
        self.points = rotate(self.points, rotation)

    def analyze(self, mahalanobis_tolerance=2):
        """Estimate grid angle and spacing, then snap inliers to integer cells.

        Parameters
        ----------
        mahalanobis_tolerance : number
            A point is kept as an inlier when its Mahalanobis value raised
            to the power 0.33 is below this threshold.

        Updates ``linedata``, ``polardata``, ``polar_cov``, ``theta``,
        ``step_size``, ``inlier_points``, ``inlier_indicies``,
        ``normalized_points`` and ``bounds`` (``[min_x, min_y, max_x,
        max_y]`` of the snapped points).
        """
        self.inlier_points = np.zeros((len(self.points), 2))
        for id1 in range(len(self.points)):
            id2 = closest_point(self.points, self.points[id1], id1)[0]

            # keep lines for plotting purposes; the [None, None] row is
            # stored as NaN (presumably a break between segments)
            self.linedata[3*id1] = self.points[id1]
            self.linedata[3*id1+1] = self.points[id2]
            self.linedata[3*id1+2] = [None, None]

            # we are repeating every pi/2, so we compress the angle space by 4x
            dy = self.points[id1, 1] - self.points[id2, 1]
            dx = self.points[id1, 0] - self.points[id2, 0]
            a = 4*math.atan2(dy, dx)
            r = np.linalg.norm(self.points[id1] - self.points[id2])
            self.polardata[id1] = [r*math.cos(a), r*math.sin(a)]

        # find the minimal covariance inlier cluster
        self.polar_cov = MinCovDet().fit(self.polardata)

        # extract the grid angle and size.  angle is divided by 4 because
        # we previously scaled it up to repeat every 90 deg
        location = self.polar_cov.location_
        self.theta = math.atan2(-location[1], location[0])/4
        self.step_size = np.linalg.norm(location)

        # extract inlier points
        polar_mahal = self.polar_cov.mahalanobis(self.polardata)**(0.33)
        inlier_count = 0
        for i, distance in enumerate(polar_mahal):
            if distance < mahalanobis_tolerance:  # tolerance to outliers
                self.inlier_points[inlier_count] = self.points[i]
                self.inlier_indicies[inlier_count] = i
                inlier_count += 1

        self.normalized_points = rotate(self.inlier_points[:inlier_count], -self.theta)/self.step_size

        # enumerate grid IDs
        # NOTE(review): np.mean without an axis is one scalar over all
        # coordinates; confirm closest_point is meant to receive a scalar
        # rather than the per-axis centroid.
        origin_id = closest_point(self.normalized_points, np.mean(self.normalized_points))[0]
        self.normalized_points = self.normalized_points - self.normalized_points[origin_id]
        inlier_count = 0

        # sys.maxint only exists on Python 2; fall back to sys.maxsize so
        # the same code also runs on Python 3.
        largest = getattr(sys, 'maxint', sys.maxsize)
        self.bounds = [largest, largest, -largest, -largest]
        for p in self.normalized_points:
            x = round(p[0])
            y = round(p[1])
            d = np.linalg.norm(p-[x, y])
            if d < 0.4:  # tolerance from unit position
                # Compact in place: the write index never passes the row
                # currently being read.
                self.normalized_points[inlier_count] = [x, y]
                self.bounds[0] = min(self.bounds[0], x)
                self.bounds[2] = max(self.bounds[2], x)
                self.bounds[1] = min(self.bounds[1], y)
                self.bounds[3] = max(self.bounds[3], y)
                inlier_count += 1

        self.normalized_points = self.normalized_points[:inlier_count]
Exemplo n.º 36
0
# Fit a formula-based model of word_diff on Age plus Centre_ID as a
# categorical term, restricted by `subset` (`ols` is defined/imported
# elsewhere; presumably the statsmodels formula API).
lm2 = ols('word_diff ~ Age + C(Centre_ID)',
         data=clean_st,subset=subset).fit()

print(lm2.summary())

# <markdowncell>

# # Snippets. Might come back to this later:

# <codecell>

from scipy.stats import pearsonr
from sklearn.covariance import MinCovDet

# just look at what's interesting for now, and drop the NAs involved
clean = st_v_merged.loc[:,['norm_diff','Interview_Suggested_Ranking_numerical_']]
clean = clean.dropna(axis=0)

# calculate robust covariance estimate, calculate what's too far away
mcd = MinCovDet()
mcd.fit(clean)

# Bare expression: the correlation is only displayed when this runs as a
# notebook cell; nothing is stored.
pearsonr(clean.iloc[:,0],clean.iloc[:,1])

# <codecell>

d = mcd.mahalanobis(clean)
# In-place sort: after this, positions in `d` no longer correspond to the
# rows of `clean`.
d.sort()
d


Exemplo n.º 37
0
import numpy as np
from sklearn.covariance import MinCovDet
from heapq import nlargest

from load_data import data, norm_data, reformat

# How many of the highest-scoring samples to report.
NUMBER_OF_ANOMALIES = 10

# Fit the robust covariance estimator on the normalised data, then pair each
# sample's Mahalanobis value with its row index.
robust_cov = MinCovDet().fit(norm_data)
mahal_robust_cov = enumerate(robust_cov.mahalanobis(norm_data))

# (index, value) pairs with the largest values, in descending order.
anomalies = nlargest(
    NUMBER_OF_ANOMALIES,
    mahal_robust_cov,
    key=lambda indexed_distance: indexed_distance[1],
)
print(anomalies)
Exemplo n.º 38
0
    for k1 in choosen:
        a1, a2, a3 = extractStats(k1[2], fr)
        a1.extend(a2)
        a1.extend(a3)
        ab.append(a1)

    rr = np.array(ab)

    #print(dataNameList_f)
    if dataNameList_f:
        print('fitting ' + str(len(dataNameList_f)) + ' new data')
        mcd.fit(rr[:-1 * nrAnalysis - 1, :])
    else:
        print('no new data')

    arn = mcd.mahalanobis(rr[-1 * nrAnalysis - 1:-1, :] -
                          mcd.location_)**(0.33)
    aro = mcd.mahalanobis(rr[:-1 * nrAnalysis - 1, :] - mcd.location_)**(0.33)

    print(np.median(aro[mcd.support_]))

    ax1.clear()
    ax1.scatter(rr[:-1 * nrAnalysis - 1, [0]],
                rr[:-1 * nrAnalysis - 1, [3]],
                marker='+')
    ax1.scatter(rr[-1 * nrAnalysis - 1:-1, [0]],
                rr[-1 * nrAnalysis - 1:-1, [3]],
                marker='o',
                c='r')

    #ax1.scatter(*mcd.location_,c='r',s=4)
Exemplo n.º 39
0
class MCD(BaseDetector):
    """Outlier detector based on the Minimum Covariance Determinant (MCD).

    MCD is a robust estimator of covariance. The detector first fits a
    minimum covariance determinant model and then uses the Mahalanobis
    distance of each sample as its outlier degree.

    The estimator is intended for Gaussian-distributed data, although it can
    still be relevant for data drawn from a unimodal, symmetric distribution.
    It is not meant for multi-modal data: the algorithm used to fit a
    MinCovDet object is likely to fail in that case, and projection pursuit
    methods should be considered instead.

    See :cite:`rousseeuw1999fast,hardin2004outlier` for details.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The proportion of outliers expected in the data set. Used when
        fitting to define the threshold on the decision function.

    store_precision : bool
        Whether the estimated precision is stored.

    assume_centered : bool
        If True, the support of the robust location and the covariance
        estimates is computed, and a covariance estimate is recomputed from
        it, without centering the data. Useful for data whose mean is
        approximately, but not exactly, zero.
        If False, the robust location and covariance are computed directly
        with the FastMCD algorithm without additional treatment.

    support_fraction : float, 0 < support_fraction < 1
        The proportion of points to be included in the support of the raw
        MCD estimate. Default is None, which implies that the minimum value
        of support_fraction will be used within the algorithm:
        [n_sample + n_features + 1] / 2

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Attributes
    ----------
    raw_location_ : array-like, shape (n_features,)
        The raw robust estimated location before correction and re-weighting.

    raw_covariance_ : array-like, shape (n_features, n_features)
        The raw robust estimated covariance before correction and
        re-weighting.

    raw_support_ : array-like, shape (n_samples,)
        A mask of the observations that have been used to compute the raw
        robust estimates of location and shape, before correction and
        re-weighting.

    location_ : array-like, shape (n_features,)
        Estimated robust location.

    covariance_ : array-like, shape (n_features, n_features)
        Estimated robust covariance matrix.

    precision_ : array-like, shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    support_ : array-like, shape (n_samples,)
        A mask of the observations that have been used to compute the robust
        estimates of location and shape.

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data: the Mahalanobis distances
        of the observations on which :meth:`fit` was called. The higher, the
        more abnormal; outliers tend to have higher scores. Available once
        the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers and 1
        for outliers/anomalies. It is generated by applying ``threshold_``
        on ``decision_scores_``.
    """

    def __init__(self, contamination=0.1, store_precision=True,
                 assume_centered=False, support_fraction=None,
                 random_state=None):
        super(MCD, self).__init__(contamination=contamination)
        # The remaining settings are kept verbatim and handed to MinCovDet
        # when fit() is called.
        self.store_precision = store_precision
        self.assume_centered = assume_centered
        self.support_fraction = support_fraction
        self.random_state = random_state

    # noinspection PyIncorrectDocstring
    def fit(self, X, y=None):
        """Fit the detector on X. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Validate X; y is optional and only passed to base-class bookkeeping.
        X = check_array(X)
        self._set_n_classes(y)

        # Forward the stored constructor settings to scikit-learn's MinCovDet.
        mcd_params = {
            'store_precision': self.store_precision,
            'assume_centered': self.assume_centered,
            'support_fraction': self.support_fraction,
            'random_state': self.random_state,
        }
        self.detector_ = MinCovDet(**mcd_params)
        self.detector_.fit(X=X, y=y)

        # The Mahalanobis distances of the training samples are the outlier
        # scores; the base-class helper then post-processes them (presumably
        # deriving threshold_ and labels_).
        self.decision_scores_ = self.detector_.dist_
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Compute the raw anomaly score of X using the fitted detector.

        The score of a sample is its Mahalanobis distance under the fitted
        MinCovDet model. For consistency with other detectors, outliers are
        assigned larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The samples to score.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        samples = check_array(X)

        # Mahalanobis distance of each sample under the fitted model.
        return self.detector_.mahalanobis(samples)

    @property
    def raw_location_(self):
        """The raw robust estimated location before correction and
        re-weighting.

        Read-only proxy for the fitted scikit-learn MinCovDet attribute.
        """
        return self.detector_.raw_location_

    @property
    def raw_covariance_(self):
        """The raw robust estimated covariance before correction and
        re-weighting.

        Read-only proxy for the fitted scikit-learn MinCovDet attribute.
        """
        return self.detector_.raw_covariance_

    @property
    def raw_support_(self):
        """A mask of the observations that have been used to compute the raw
        robust estimates of location and shape, before correction and
        re-weighting.

        Read-only proxy for the fitted scikit-learn MinCovDet attribute.
        """
        return self.detector_.raw_support_

    @property
    def location_(self):
        """Estimated robust location.

        Read-only proxy for the fitted scikit-learn MinCovDet attribute.
        """
        return self.detector_.location_

    @property
    def covariance_(self):
        """Estimated robust covariance matrix.

        Read-only proxy for the fitted scikit-learn MinCovDet attribute.
        """
        return self.detector_.covariance_

    @property
    def precision_(self):
        """Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

        Read-only proxy for the fitted scikit-learn MinCovDet attribute.
        """
        return self.detector_.precision_

    @property
    def support_(self):
        """A mask of the observations that have been used to compute the
        robust estimates of location and shape.

        Read-only proxy for the fitted scikit-learn MinCovDet attribute.
        """
        return self.detector_.support_