def test_mcd_issue1127():
    # Check that the code does not break with X.shape = (3, 1)
    # (i.e. n_support = n_samples)
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(3, 1))
    mcd = MinCovDet()
    mcd.fit(X)
def MCD_Score(train_a, test_a, test_b):
    mcd = MinCovDet()
    mcd.fit(train_a)
    mcd_anoscore = mcd.mahalanobis(test_a)
    mcd_normalscore = mcd.mahalanobis(test_b)
    print("mcd ano score {} mcd normal score {}".format(
        mcd_anoscore, mcd_normalscore))
def test_mcd_increasing_det_warning():
    # Check that a warning is raised if we observe increasing determinants
    # during the c_step. In theory the sequence of determinants should be
    # decreasing. Increasing determinants are likely due to ill-conditioned
    # covariance matrices that result in poor precision matrices.
    X = [[5.1, 3.5, 1.4, 0.2], [4.9, 3.0, 1.4, 0.2], [4.7, 3.2, 1.3, 0.2],
         [4.6, 3.1, 1.5, 0.2], [5.0, 3.6, 1.4, 0.2], [4.6, 3.4, 1.4, 0.3],
         [5.0, 3.4, 1.5, 0.2], [4.4, 2.9, 1.4, 0.2], [4.9, 3.1, 1.5, 0.1],
         [5.4, 3.7, 1.5, 0.2], [4.8, 3.4, 1.6, 0.2], [4.8, 3.0, 1.4, 0.1],
         [4.3, 3.0, 1.1, 0.1], [5.1, 3.5, 1.4, 0.3], [5.7, 3.8, 1.7, 0.3],
         [5.4, 3.4, 1.7, 0.2], [4.6, 3.6, 1.0, 0.2], [5.0, 3.0, 1.6, 0.2],
         [5.2, 3.5, 1.5, 0.2]]

    mcd = MinCovDet(random_state=1)
    warn_msg = "Determinant has increased"
    with pytest.warns(RuntimeWarning, match=warn_msg):
        mcd.fit(X)
def _naiveMCD(self, dataset, thresh=3):
    types = LoLTypeInference().getDataTypes(dataset)
    qdataset = [[d[i] for i, t in enumerate(types) if t == 'numerical']
                for d in dataset]

    X = featurize(qdataset, [t for t in types if t == 'numerical'])
    xshape = np.shape(X)

    # for conditioning problems with the estimate
    Xsamp = X + 0.01 * np.random.randn(xshape[0], xshape[1])

    m = MinCovDet()
    m.fit(Xsamp)
    sigma = np.linalg.inv(m.covariance_)
    mu = np.mean(X, axis=0)

    results = []
    for i in range(0, xshape[0]):
        # This indexing pattern appears to assume featurize returns a numpy
        # matrix, so '*' is matrix multiplication and the product is (1, 1).
        val = np.squeeze((X[i, :] - mu) * sigma * (X[i, :] - mu).T)[0, 0]
        results.append([str(val)])

    e = ErrorDetector(results,
                      modules=[QuantitativeErrorModule],
                      config=[{'thresh': thresh}])
    e.fit()

    return set([error['cell'][0] for error in e])
class ActionDetector(object):
    """Publish whether the robot is in action or not to a rostopic, by the MT method.

    NOTE: Before starting to detect action, some waiting time is required.
    This is preparation time to calculate the Mahalanobis distance.
    Reaction speed for action detection is a bit slow because the spectrum
    is the mean of the spectrogram, not the right edge of the spectrogram.
    """

    def __init__(self):
        # Config for loading no-action spectrum (noise data)
        rospack = rospkg.RosPack()
        self.train_dir = osp.join(rospack.get_path('decopin_hand'), 'train_data')
        if not osp.exists(self.train_dir):
            makedirs(self.train_dir)
        self.noise_data_path = osp.join(self.train_dir, 'noise.npy')
        if not osp.exists(self.noise_data_path):
            rospy.logerr('{} is not found. Exit.'.format(self.noise_data_path))
            exit()
        no_action_data = np.load(self.noise_data_path)
        # Extract about 100 samples from no_action_data
        # (integer division, so the slice step is an int under Python 3)
        divide = max(1, len(no_action_data) // 100)
        no_action_data = no_action_data[::divide]
        # Detect in action or not by Mahalanobis distance
        self.anormal_threshold = rospy.get_param('~anormal_threshold')
        self.mcd = MinCovDet()
        self.mcd.fit(no_action_data)
        rospy.loginfo('Calc covariance matrix for Mahalanobis distance')
        # ROS
        self.bridge = CvBridge()
        self.pub = rospy.Publisher('~in_action', Bool, queue_size=1)
        self.sub = rospy.Subscriber('~raw_spectrogram', Image, self.cb)

    def cb(self, msg):
        """Publish whether the robot is in action or not."""
        # spectrogram.shape is (height, width) = (spectrum, time)
        spectrogram = self.bridge.imgmsg_to_cv2(msg)
        self.current_spectrum = np.average(spectrogram, axis=1)
        # Check whether the current spectrum is in action or not
        spectrum = self.current_spectrum[None]
        dist = self.mcd.mahalanobis(spectrum)[0]
        info_message = '(mahalanobis distance, threshold) = ({}, {})'.format(
            dist, self.anormal_threshold)
        if dist < self.anormal_threshold:
            self.in_action = False
            rospy.loginfo('No action\n' + info_message + '\n')
        else:
            self.in_action = True
            rospy.loginfo('### In action ###\n' + info_message + '\n')
        pub_msg = Bool(data=self.in_action)
        self.pub.publish(pub_msg)
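
# A minimal launcher sketch for the node above, following the usual rospy
# entry-point pattern; the node name here is an assumption.
if __name__ == '__main__':
    rospy.init_node('action_detector')
    ActionDetector()
    rospy.spin()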
def MCD_ano_score():
    print("Mahalanobis distance (each MCD) ano score")
    mcd = MinCovDet()
    mcd.fit(train_normal)
    # The original assigned the anomaly score from test_normal and the
    # normal score from test_ano; the labels were swapped, fixed here.
    mcd_anoscore = mcd.mahalanobis(test_ano)
    mcd_normalscore = mcd.mahalanobis(test_normal)
    print("mcd ano score {} mcd normal score {}".format(
        mcd_anoscore, mcd_normalscore))
def leverage(self, X):
    mcd = MinCovDet()
    mcd.fit(X)
    loc, cov = mcd.location_, mcd.covariance_
    inversed_cov = np.linalg.inv(cov)
    result = np.zeros(X.shape[0])
    for i, element in enumerate(X):
        h = np.sqrt(
            np.transpose(element - loc) @ inversed_cov @ (element - loc))
        result[i] = h
    return result
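
# For reference, the loop above computes the (non-squared) robust Mahalanobis
# distance. A vectorized sketch of the same quantity using sklearn directly;
# note that MinCovDet.mahalanobis returns *squared* distances, hence the sqrt.
import numpy as np
from sklearn.covariance import MinCovDet

X_demo = np.random.RandomState(0).normal(size=(200, 3))
mcd_demo = MinCovDet(random_state=0).fit(X_demo)
h_demo = np.sqrt(mcd_demo.mahalanobis(X_demo))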
def RejectOutliers(data, threshold=3):
    """Rejects nodal outliers more than `threshold` away from the mean,
    based on the Mahalanobis distance."""
    from sklearn.covariance import MinCovDet
    clf = MinCovDet()
    clf.fit(data)
    # Note: sklearn's mahalanobis() returns *squared* distances.
    distances = clf.mahalanobis(data)
    outliers = np.where(distances >= threshold)[0]
    inliers = np.where(distances < threshold)[0]
    return inliers, outliers
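
# Because the squared distances of Gaussian inliers approximately follow a
# chi-squared distribution with n_features degrees of freedom, a
# percentile-based cutoff is a common alternative to a fixed threshold.
# Sketch only; the 97.5th percentile is an assumed choice.
import numpy as np
from scipy.stats import chi2
from sklearn.covariance import MinCovDet

data = np.random.RandomState(1).normal(size=(100, 4))
clf = MinCovDet(random_state=1).fit(data)
d2 = clf.mahalanobis(data)                  # squared distances
cutoff = chi2.ppf(0.975, df=data.shape[1])  # assumed significance level
outliers = np.where(d2 >= cutoff)[0]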
def resif(self, X):
    r"""Computes the robust empirical influence function (RESIF).

    Chooses :math:`\Omega_2 = (\theta, \hat{\Sigma})` as the estimator of
    the central model.

    :param X: ndarray, shape (n_samples, n_features)
        Training data
    :return: ndarray, shape (n_samples,)
        RESIF of each sample
    """
    mcd = MinCovDet()
    mcd.fit(X=X)
    loc, cov = mcd.location_, mcd.covariance_
    inversed_cov = np.linalg.inv(cov)
    result = np.zeros(len(X))
    for i, element in enumerate(X):
        h = np.sqrt(
            np.transpose(element - loc) @ inversed_cov @ (element - loc))
        result[i] = h
    return result
def robust_hurst(ts, lags=100, robust_cov=True, plot=False):
    minCovDet = MinCovDet(assume_centered=True)
    n = ts.shape[0]

    # calculate lagged variances
    var_lags = np.zeros(lags - 1)
    for lag in range(1, lags):
        lagged_series = ts[lag:] - ts[:-lag]
        if robust_cov:
            minCovDet.fit(lagged_series.reshape(-1, 1))
            # np.asscalar was removed in recent NumPy; use .item() instead
            var_lags[lag - 1] = minCovDet.covariance_.item()
        else:
            var_lags[lag - 1] = np.dot(lagged_series, lagged_series) / (n - lag - 1)

    # calculate log-log slopes
    slopes = np.zeros(int(comb(lags - 2, 2)))
    cntr = 0
    for i in range(1, lags - 1):
        for j in range(i + 1, lags - 1):
            slopes[cntr] = np.log(
                var_lags[j] / var_lags[i]) / (2 * np.log(float(j) / i))
            cntr += 1
    H_est = np.median(slopes)

    # plot
    if plot:
        plt.figure()
        plt.hist(slopes)
        plt.figure()
        plt.plot(np.log(range(1, lags)), np.log(var_lags))
        plt.plot(np.log(range(1, lags)), np.log(range(1, lags)) * H_est)

    return H_est
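
# A quick sanity-check sketch for robust_hurst: an uncorrelated random walk
# should give an estimate near H = 0.5. Assumes `comb` comes from
# scipy.special, as the function body implies.
import numpy as np
from scipy.special import comb

rw = np.cumsum(np.random.RandomState(0).normal(size=2000))
H = robust_hurst(rw, lags=50, robust_cov=False)  # expect H close to 0.5
print(H)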
class myMahalanobisDistance():

    def __init__(self, estimator='ML', tol=1e-6):
        if estimator == 'ML':
            self.estimator_ = EmpiricalCovariance(store_precision=True,
                                                  assume_centered=False)
        elif estimator == 'MCD':
            self.estimator_ = MinCovDet(store_precision=True,
                                        assume_centered=False,
                                        support_fraction=None,
                                        random_state=0)
        else:
            self.estimator_ = None
        self.tol_ = tol

    def fit(self, X_tr):
        self.D_ = X_tr.shape[1]
        if self.estimator_ is None:
            self.cov_ = np.cov(X_tr.T)
            if np.linalg.matrix_rank(self.cov_) != self.D_:
                self.cov_ += self.tol_ * np.eye(self.D_)
        else:
            self.estimator_ = self.estimator_.fit(X_tr)
            self.cov_ = self.estimator_.covariance_
            if np.linalg.matrix_rank(self.cov_) != self.D_:
                self.cov_ += self.tol_ * np.eye(self.D_)
        self.inv_ = np.linalg.inv(self.cov_)
        self = self.__setEig()
        return self

    def __setEig(self):
        self.lams_, self.U_ = np.linalg.eig(self.inv_)
        self.U_ = self.U_.T
        self.Lam_ = np.diag(np.sqrt(self.lams_))
        self.L_ = self.U_.T.dot(self.Lam_).T
        return self

    def mahalanobis_dist(self, x, y, p=2):
        if p == 1:
            return np.linalg.norm(self.L_.dot(x - y), ord=1)
        else:
            return mahalanobis(x, y, self.inv_)
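
# A usage sketch for the class above, with the imports its body assumes
# (EmpiricalCovariance/MinCovDet from sklearn, mahalanobis from scipy).
import numpy as np
from sklearn.covariance import EmpiricalCovariance, MinCovDet
from scipy.spatial.distance import mahalanobis

X = np.random.RandomState(0).normal(size=(300, 4))
md = myMahalanobisDistance(estimator='MCD').fit(X)
print(md.mahalanobis_dist(X[0], X[1]))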
class OutlierMahalanobis(TransformerMixin):

    def __init__(self, support_fraction=0.95, verbose=False,
                 chi2_percentile=0.995, qqplot=True):
        self.verbose = verbose
        self.support_fraction = support_fraction
        self.chi2 = stats.chi2
        self.mcd = MCD(store_precision=True, support_fraction=support_fraction)
        self.chi2_percentile = chi2_percentile
        self.qqplot = qqplot

    def get_params(self):
        return {"support_fraction": self.support_fraction,
                "chi2_percentile": self.chi2_percentile}

    def set_params(self, **parameters):
        for key, value in parameters.items():
            setattr(self, key, value)
        return self

    def fit(self, X, y=None):
        """Prints some summary stats (if verbose is on) and records the
        indices of what it considers to be extreme values."""
        self.mcd.fit(X)
        d = np.array([distance.mahalanobis(p, self.mcd.location_,
                                           self.mcd.precision_) for p in X])
        self.d2 = d ** 2  # MD squared
        n, self.degrees_of_freedom_ = X.shape
        self.iextreme_values = (
            self.d2 > self.chi2.ppf(self.chi2_percentile,
                                    self.degrees_of_freedom_))
        if self.verbose:
            print("%.3f proportion of outliers at %.3f%% chi2 percentile, "
                  % (self.iextreme_values.sum() / float(n),
                     self.chi2_percentile))
            print("with support fraction %.2f." % self.support_fraction)

        pvalue = stats.kstest(
            self.d2,
            lambda x: stats.chi2.cdf(x, df=self.degrees_of_freedom_))[1]
        if pvalue <= 0.01:
            print('Warning: very strong presumption against the null hypothesis, p_value: ' + str(pvalue))
        elif pvalue <= 0.05:
            print('Warning: strong presumption against the null hypothesis, p_value: ' + str(pvalue))
        elif pvalue <= 0.1:
            print('Weak presumption against the null hypothesis, p_value: ' + str(pvalue))
        else:
            print('No presumption against the null hypothesis. p_value: ' + str(pvalue))

        if self.qqplot:
            plt.figure(figsize=(10, 10))
            stats.probplot(self.d2,
                           dist=stats.chi2(df=self.degrees_of_freedom_),
                           plot=plt)
            plt.title('QQ plot between Mahalanobis distance quantiles and Chi2 quantiles')
            plt.show()
        return self

    def transform(self, X):
        return X[~self.iextreme_values]

    def plot(self, log=False, sort=False):
        """Plot the squared distances.

        log: transform the distance-sq to a log(distance-sq)
        sort: sort the data according to distance before plotting
        """
        n = self.d2.shape[0]
        fig = plt.figure(figsize=(10, 10))
        x = np.arange(n)
        ax = fig.add_subplot(111)
        transform = (lambda x: x) if not log else (lambda x: np.log(x))
        chi_line = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_)
        chi_line = transform(chi_line)
        d2 = transform(self.d2)
        if sort:
            isort = np.argsort(d2)
            ax.scatter(x, d2[isort], alpha=0.7, facecolors='none')
            plt.plot(x, transform(self.chi2.ppf(np.linspace(0, 1, n),
                                                self.degrees_of_freedom_)),
                     c="r", label="distribution assuming normal")
        else:
            ax.scatter(x, d2)
            extreme_values = d2[self.iextreme_values]
            ax.scatter(x[self.iextreme_values], extreme_values, color="r")
        ax.hlines(chi_line, 0, n,
                  label=r"%.1f%% $\chi^2$ quantile" % (100 * self.chi2_percentile),
                  linestyles="dotted")
        ax.legend()
        ax.set_ylabel("distance squared")
        ax.set_xlabel("observation")
        ax.set_xlim(0, self.d2.shape[0])
        plt.show()
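
# A usage sketch for the transformer above, on synthetic data with a few
# planted outliers. Assumes MCD aliases sklearn's MinCovDet and that the
# scipy.stats / scipy.spatial.distance / matplotlib imports from the class
# body are in scope.
import numpy as np

X = np.r_[np.random.RandomState(3).normal(size=(97, 3)),
          np.full((3, 3), 8.0)]
om = OutlierMahalanobis(verbose=True, qqplot=False)
X_clean = om.fit(X).transform(X)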
class Outlier_detection(object):

    def __init__(self, support_fraction=0.95, verbose=True,
                 chi2_percentile=0.995):
        self.verbose = verbose
        self.support_fraction = support_fraction
        self.chi2 = stats.chi2
        self.mcd = MCD(store_precision=True, support_fraction=support_fraction)
        self.chi2_percentile = chi2_percentile

    def fit(self, X):
        """Prints some summary stats (if verbose is on) and records the
        indices of what it considers to be extreme values."""
        self.mcd.fit(X)
        mahalanobis = lambda p: distance.mahalanobis(p, self.mcd.location_,
                                                     self.mcd.precision_)
        d = np.array(list(map(mahalanobis, X)))  # Mahalanobis distance values
        self.d2 = d ** 2  # MD squared
        n, self.degrees_of_freedom_ = X.shape
        # (was hard-coded to 0.995, ignoring the chi2_percentile parameter)
        self.iextreme_values = (
            self.d2 > self.chi2.ppf(self.chi2_percentile,
                                    self.degrees_of_freedom_))
        if self.verbose:
            print("%.3f proportion of outliers at %.3f%% chi2 percentile, "
                  % (self.iextreme_values.sum() / float(n),
                     self.chi2_percentile))
            print("with support fraction %.2f." % self.support_fraction)
        return self

    def plot(self, log=False, sort=False):
        """Plot the squared distances.

        log: transform the distance-sq to a log(distance-sq)
        sort: sort the data according to distance before plotting
        """
        n = self.d2.shape[0]
        fig = plt.figure()
        x = np.arange(n)
        ax = fig.add_subplot(111)
        transform = (lambda x: x) if not log else (lambda x: np.log(x))
        chi_line = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_)
        chi_line = transform(chi_line)
        d2 = transform(self.d2)
        if sort:
            isort = np.argsort(d2)
            ax.scatter(x, d2[isort], alpha=0.7, facecolors='none')
            plt.plot(x, transform(self.chi2.ppf(np.linspace(0, 1, n),
                                                self.degrees_of_freedom_)),
                     c="r", label="distribution assuming normal")
        else:
            ax.scatter(x, d2)
            extreme_values = d2[self.iextreme_values]
            ax.scatter(x[self.iextreme_values], extreme_values, color="r")
        ax.hlines(chi_line, 0, n,
                  label=r"%.1f%% $\chi^2$ quantile" % (100 * self.chi2_percentile),
                  linestyles="dotted")
        ax.legend()
        ax.set_ylabel("distance squared")
        ax.set_xlabel("observation")
        ax.set_xlim(0, self.d2.shape[0])
        plt.show()
def test_mcd_class_on_invalid_input():
    X = np.arange(100)
    mcd = MinCovDet()
    msg = "Expected 2D array, got 1D array instead"
    with pytest.raises(ValueError, match=msg):
        mcd.fit(X)
def remove_drugs_with_low_effect_multivariate(
        feat, meta, signif_level=0.05, cov_estimator='EmpiricalCov',
        drugname_column='drug_type', dose_column='drug_dose',
        keep_names=['DMSO', 'NoCompound'], return_nonsignificant=False):
    """
    Remove drugs when all the doses of the drug are very close to DMSO.
    Whether a dose is very close to DMSO is checked using the Mahalanobis
    distance (MD), calculated based on the robust covariance estimate of the
    DMSO observations and assuming that the MD^2 of DMSO points follows a
    chi2 distribution with n_feat degrees of freedom.

    param:
        feat : dataframe
            feature dataframe
        meta : dataframe
            dataframe with sample identification data
        signif_level : float
            Defines the significance level for the p-value of the hypothesis
            test for each drug dose based on the MD^2 distribution.
        cov_estimator : 'RobustCov' or 'EmpiricalCov'
            Specifies the method to estimate the covariance matrix.
        return_nonsignificant : bool, optional
            return the names of the drugs that are removed from the dataset

    return:
        feat : dataframe
            feature dataframe with low-potency drugs removed
        meta : dataframe
            metadata dataframe with low-potency drugs removed
    """
    from sklearn.covariance import MinCovDet, EmpiricalCovariance
    from scipy.stats import chi2
    from time import time

    if cov_estimator == 'RobustCov':
        estimator = MinCovDet()
    elif cov_estimator == 'EmpiricalCov':
        estimator = EmpiricalCovariance()

    print('Estimating covariance matrix...')
    st_time = time()
    estimator.fit(feat[meta[drugname_column].isin(['DMSO'])])
    print('Done in {:.2f}.'.format(time() - st_time))

    drug_names = meta[drugname_column].unique()

    mah_dist = {}
    signif_effect_drugs = []
    for idr, drug in enumerate(drug_names):
        if drug in keep_names:
            continue
        print('Checking compound {} ({}/{})...'.format(
            drug, idr + 1, drug_names.shape[0]))
        X = feat[meta[drugname_column].isin([drug])]
        X.insert(0, 'dose',
                 meta.loc[meta[drugname_column].isin([drug]), dose_column])
        X = X.groupby(by='dose').mean()

        md2 = estimator.mahalanobis(X)
        mah_dist[drug] = md2
        nft = feat.shape[1]

        # Compute the p-values
        p_vals = 1 - chi2.cdf(md2, nft)

        # Keep the drug if any dose is extreme at the chosen significance level
        if any(p_vals < signif_level):
            signif_effect_drugs.append(drug)

    signif_effect_drugs.extend(keep_names)
    feat = feat[meta[drugname_column].isin(signif_effect_drugs)]
    meta = meta[meta[drugname_column].isin(signif_effect_drugs)]

    if return_nonsignificant:
        return feat, meta, list(
            set(drug_names).difference(set(signif_effect_drugs))), mah_dist
    else:
        return feat, meta
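
# A hedged usage sketch for the function above. The dataframes here are
# purely synthetic, and the column names simply follow the function defaults.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
feat = pd.DataFrame(rng.normal(size=(60, 5)),
                    columns=['f{}'.format(i) for i in range(5)])
meta = pd.DataFrame({'drug_type': ['DMSO'] * 30 + ['drugA'] * 15 + ['drugB'] * 15,
                     'drug_dose': [0] * 30 + [1, 10, 100] * 10})
feat_kept, meta_kept = remove_drugs_with_low_effect_multivariate(feat, meta)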
fig = plt.figure()
fig.suptitle('Parallel Coordinates Plot of Potential Outliers in Wisconsin Breast Cancer Data')
parallel_coordinates(d.iloc[possible_outliers, :],
                     class_column='diagnosis',
                     cols=d.columns[3:],
                     color=('#0158FE', '#FE0101'))
plt.show()

#-------------------------------------------------------------------------------------------------#
#----------------------------------------Robust Covariance----------------------------------------#
#-------------------------------------------------------------------------------------------------#

robust_cov = MinCovDet(assume_centered=False, random_state=14)
robust_cov.fit(d.iloc[:, 3:12])

# View covariance matrix before and after reweighting
sns.heatmap(robust_cov.raw_covariance_, annot=True)
plt.title('Raw Covariance Matrix')
plt.show()
sns.heatmap(robust_cov.covariance_, annot=True)
plt.title('Robust Covariance Matrix')
plt.show()

# View the Mahalanobis distances on the PCA plot
pca_model = PCA(n_components=None, whiten=False, random_state=14)
pca_dim = pca_model.fit_transform(d.iloc[:, 3:12])
plt.figure(figsize=(10, 5))
plt.xlabel('Latent Variable 1 (explains most variance)')
# Fragment: p, G, h and the initial value of N are defined earlier in the
# original script (cvxopt's matrix and qp solver are used below).
A = matrix(np.ones((1, N)))
b = matrix(1.0)

results_df = pd.DataFrame()
for train_window in range(150, 1000 + 1, 10):
    for train_data_start in range(0, returns.shape[0] - train_window + 1, 50):
        train_data_end = train_data_start + train_window
        R_train = np.asarray(returns.iloc[train_data_start:train_data_end, :])
        n = R_train.shape[0]
        N = R_train.shape[1]

        mcd = MinCovDet()
        mcd.fit(R_train)
        S = mcd.covariance_

        # Markowitz long-only
        Q = matrix(2 * S)
        sol = qp(Q, p, G, h, A, b)
        w_opt_M = np.reshape(sol["x"], N)

        # Calculate statistics
        # Train
        cum_returns_train = np.asarray(
            (prices.iloc[train_data_start:train_data_end, :]
             - prices.iloc[train_data_start, :])[1:])
        train_returns = np.dot(cum_returns_train, w_opt_M)
        H_train = robust_hurst(train_returns)

        lr.fit(
lm2 = ols('word_diff ~ Age + C(Centre_ID)',
          data=clean_st, subset=subset).fit()
print(lm2.summary())

# <markdowncell>

# # Snippets. Might come back to this later:

# <codecell>

from scipy.stats import pearsonr
from sklearn.covariance import MinCovDet

# just look at what's interesting for now, and drop the NAs involved
clean = st_v_merged.loc[:, ['norm_diff', 'Interview_Suggested_Ranking_numerical_']]
clean = clean.dropna(axis=0)

# calculate robust covariance estimate, calculate what's too far away
mcd = MinCovDet()
mcd.fit(clean)
pearsonr(clean.iloc[:, 0], clean.iloc[:, 1])

# <codecell>

d = mcd.mahalanobis(clean)
d.sort()
d
class MCD():
    """
    Minimum Covariance Determinant (MCD) based anomaly detection relies on
    the fact that Mahalanobis-type distances, in which the shape matrix is
    derived from a consistent high-breakdown robust multivariate location
    and scale estimator, can be used to find anomalous points.

    The Minimum Covariance Determinant covariance estimator is to be applied
    on Gaussian-distributed data, but could still be relevant on data drawn
    from a unimodal, symmetric distribution.

    Parameters
    ----------
    store_precision : bool
        Specify if the estimated precision is stored.

    assume_centered : bool
        If True, the support of the robust location and the covariance
        estimates is computed, and a covariance estimate is recomputed from
        it, without centering the data.
        Useful to work with data whose mean is significantly equal to zero
        but is not exactly zero.
        If False, the robust location and covariance are directly computed
        with the FastMCD algorithm without additional treatment.

    support_fraction : float, 0 < support_fraction < 1
        The proportion of points to be included in the support of the raw
        MCD estimate. Default is None, which implies that the minimum value
        of support_fraction will be used within the algorithm:
        [n_sample + n_features + 1] / 2

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    """

    def __init__(self, store_precision=True, assume_centered=False,
                 support_fraction=None, random_state=None):
        self.store_precision = store_precision
        self.assume_centered = assume_centered
        self.support_fraction = support_fraction
        self.random_state = random_state

    def fit(self, X):
        """Fit detector.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.
        """
        self.X_train = check_array(X)
        self.mcd = MinCovDet(store_precision=self.store_precision,
                             assume_centered=self.assume_centered,
                             support_fraction=self.support_fraction,
                             random_state=self.random_state)
        # The original called self.mcd.fit(X=X, y=y) with an undefined y;
        # MinCovDet ignores y, so fit on X alone.
        self.mcd.fit(self.X_train)

    def decision(self, X):
        """Predict anomaly score of each element.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        ll : array, shape (n_samples,)
            Mahalanobis distance of each sample under the current model,
            which is the anomaly score of each element.
        """
        return self.mcd.mahalanobis(X)
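
# A minimal usage sketch for the wrapper above, on synthetic data. It assumes
# check_array was imported from sklearn.utils, as the class body implies.
import numpy as np

X_train = np.random.RandomState(0).normal(size=(200, 3))
det = MCD(random_state=0)
det.fit(X_train)
scores = det.decision(X_train)  # squared Mahalanobis distances; higher = more anomalous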
class MCD(BaseDetector):
    """Detecting outliers in a Gaussian distributed dataset using
    Minimum Covariance Determinant (MCD): robust estimator of covariance.

    The Minimum Covariance Determinant covariance estimator is to be applied
    on Gaussian-distributed data, but could still be relevant on data drawn
    from a unimodal, symmetric distribution. It is not meant to be used with
    multi-modal data (the algorithm used to fit a MinCovDet object is likely
    to fail in such a case). One should consider projection pursuit methods
    to deal with multi-modal datasets.

    First fit a minimum covariance determinant model and then compute the
    Mahalanobis distance as the outlier degree of the data.

    See :cite:`rousseeuw1999fast,hardin2004outlier` for details.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the
        threshold on the decision function.

    store_precision : bool
        Specify if the estimated precision is stored.

    assume_centered : bool
        If True, the support of the robust location and the covariance
        estimates is computed, and a covariance estimate is recomputed from
        it, without centering the data.
        Useful to work with data whose mean is significantly equal to zero
        but is not exactly zero.
        If False, the robust location and covariance are directly computed
        with the FastMCD algorithm without additional treatment.

    support_fraction : float, 0 < support_fraction < 1
        The proportion of points to be included in the support of the raw
        MCD estimate. Default is None, which implies that the minimum value
        of support_fraction will be used within the algorithm:
        [n_sample + n_features + 1] / 2

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Attributes
    ----------
    raw_location_ : array-like, shape (n_features,)
        The raw robust estimated location before correction and re-weighting.

    raw_covariance_ : array-like, shape (n_features, n_features)
        The raw robust estimated covariance before correction and
        re-weighting.

    raw_support_ : array-like, shape (n_samples,)
        A mask of the observations that have been used to compute the raw
        robust estimates of location and shape, before correction and
        re-weighting.

    location_ : array-like, shape (n_features,)
        Estimated robust location.

    covariance_ : array-like, shape (n_features, n_features)
        Estimated robust covariance matrix.

    precision_ : array-like, shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    support_ : array-like, shape (n_samples,)
        A mask of the observations that have been used to compute the robust
        estimates of location and shape.

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data. The higher, the more
        abnormal. Outliers tend to have higher scores. This value is
        available once the detector is fitted. Mahalanobis distances of the
        training set (on which :meth:`fit` is called) observations.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers and 1
        for outliers/anomalies. It is generated by applying ``threshold_``
        on ``decision_scores_``.
    """

    def __init__(self, contamination=0.1, store_precision=True,
                 assume_centered=False, support_fraction=None,
                 random_state=None):
        super(MCD, self).__init__(contamination=contamination)
        self.store_precision = store_precision
        self.assume_centered = assume_centered
        self.support_fraction = support_fraction
        self.random_state = random_state

    # noinspection PyIncorrectDocstring
    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)
        self.detector_ = MinCovDet(store_precision=self.store_precision,
                                   assume_centered=self.assume_centered,
                                   support_fraction=self.support_fraction,
                                   random_state=self.random_state)
        self.detector_.fit(X=X, y=y)

        # Use Mahalanobis distance as the outlier score
        self.decision_scores_ = self.detector_.dist_
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)

        # Compute the Mahalanobis distance of the samples
        return self.detector_.mahalanobis(X)

    @property
    def raw_location_(self):
        """The raw robust estimated location before correction and
        re-weighting.

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.raw_location_

    @property
    def raw_covariance_(self):
        """The raw robust estimated covariance before correction and
        re-weighting.

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.raw_covariance_

    @property
    def raw_support_(self):
        """A mask of the observations that have been used to compute the
        raw robust estimates of location and shape, before correction and
        re-weighting.

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.raw_support_

    @property
    def location_(self):
        """Estimated robust location.

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.location_

    @property
    def covariance_(self):
        """Estimated robust covariance matrix.

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.covariance_

    @property
    def precision_(self):
        """Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.precision_

    @property
    def support_(self):
        """A mask of the observations that have been used to compute the
        robust estimates of location and shape.

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.support_
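
# A short usage sketch for the PyOD-style detector above, on synthetic data
# with a small injected cluster of outliers.
import numpy as np

rng = np.random.RandomState(42)
X_train = np.r_[rng.normal(size=(95, 2)), rng.normal(loc=6.0, size=(5, 2))]

clf = MCD(contamination=0.05, random_state=42)
clf.fit(X_train)
print(clf.labels_)                     # 0 = inlier, 1 = outlier
print(clf.decision_function(X_train))  # squared Mahalanobis distances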
class Outlier_detection(object):

    def __init__(self, support_fraction=0.95, verbose=True,
                 chi2_percentile=0.995):
        self.verbose = verbose
        self.support_fraction = support_fraction
        self.chi2 = stats.chi2
        self.mcd = MCD(store_precision=True, support_fraction=support_fraction)
        self.chi2_percentile = chi2_percentile

    def fit(self, X):
        """Prints some summary stats (if verbose is on) and records the
        indices of what it considers to be extreme values."""
        self.mcd.fit(X)
        mahalanobis = lambda p: distance.mahalanobis(p, self.mcd.location_,
                                                     self.mcd.precision_)
        d = np.array(list(map(mahalanobis, X)))  # Mahalanobis distance values
        self.d2 = d ** 2  # MD squared  # <--- l2 norm only option?!
        n, self.degrees_of_freedom_ = X.shape
        # boolean array showing outliers
        # (was hard-coded to 0.995, ignoring the chi2_percentile parameter)
        self.iextreme_values = (
            self.d2 > self.chi2.ppf(self.chi2_percentile,
                                    self.degrees_of_freedom_))
        # (the original indexed `od.iextreme_values`, a leftover global name)
        self.outlier_inds = np.nonzero(self.iextreme_values)[0]
        if self.verbose:
            print("%.3f proportion of outliers at %.3f%% chi2 percentile, "
                  % (self.iextreme_values.sum() / float(n),
                     self.chi2_percentile))
            print("with support fraction %.2f." % self.support_fraction)
        return self

    def plot(self, log=False, sort=False):
        """
        log: transform the distance-sq to a log
        sort: sort the data according to distance before plotting
        """
        n = self.d2.shape[0]
        fig = plt.figure()
        x = np.arange(n)
        ax = fig.add_subplot(111)
        transform = (lambda x: x) if not log else (lambda x: np.log(x))
        chi_line = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_)
        chi_line = transform(chi_line)
        d2 = transform(self.d2)
        if sort:
            isort = np.argsort(d2)
            ax.scatter(x, d2[isort], alpha=0.7, facecolors='none')
            plt.plot(x, transform(self.chi2.ppf(np.linspace(0, 1, n),
                                                self.degrees_of_freedom_)),
                     c="r", label="distribution assuming normal")
        else:
            ax.scatter(x, d2)
            extreme_values = d2[self.iextreme_values]
            ax.scatter(x[self.iextreme_values], extreme_values, color="r")
        ax.hlines(chi_line, 0, n,
                  label=r"%.1f%% $\chi^2$ quantile" % (100 * self.chi2_percentile),
                  linestyles="dotted")
        ax.legend()
        ax.set_ylabel("distance squared")
        ax.set_xlabel("observation")
        ax.set_xlim(0, self.d2.shape[0])
        plt.show()

    # if plot_2d:
    #     if self.degrees_of_freedom_ != 2:
    #         print('Dataset dimensions do not allow 2D plot.')
    #     else:

# =============================================================================
# TO DO: ADD 3D VERSION (SHOULD HAVE THIS SOMEWHERE)
# =============================================================================
# =============================================================================
# TO DO: ADD ROBUST DEMO FROM SKLEARN (AND ADAPT), SEE BELOW:
# =============================================================================

"""
Robust Mahalanobis distance

Sources:
https://en.wikipedia.org/wiki/Mahalanobis_distance
http://scikit-learn.org/stable/auto_examples/covariance/plot_mahalanobis_distances.html#sphx-glr-auto-examples-covariance-plot-mahalanobis-distances-py
^^ latter uses robust estimates of mu and covariance!
"""

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
# (the original called np.random(50, 2), which is not callable)
Mahal = pd.DataFrame(data=np.random.rand(50, 2))  # add
X = Mahal.values  # MID_overview[['recentSales', 'CB_perc', 'HRW_perc']].values
# NB. more robust than the standard covariance estimator
# EmpiricalCovariance().fit(X)
robust_cov = MinCovDet().fit(X)
# Note: the manual centering before mahalanobis() matches the older
# scikit-learn API; recent versions subtract location_ internally.
Mahal['mahal_dist'] = robust_cov.mahalanobis(X - robust_cov.location_)  # ** (0.33)
Mahal['rank_mahal'] = Mahal.mahal_dist.rank(ascending=True).astype(int)
choosen = [k for k in rm if k[0] == fname]
choosen.sort(
    key=lambda x: datetime.datetime.strptime(x[1], "%Y%m%d-%H%M%S"))
for k1 in choosen:
    a1, a2, a3 = extractStats(k1[2], fr)
    a1.extend(a2)
    a1.extend(a3)
    ab.append(a1)
rr = np.array(ab)
#print(dataNameList_f)
if dataNameList_f:
    print('fitting ' + str(len(dataNameList_f)) + ' new data')
    mcd.fit(rr[:-1 * nrAnalysis - 1, :])
else:
    print('no new data')
# Note: the manual centering before mahalanobis() matches the older
# scikit-learn API; recent versions subtract location_ internally.
arn = mcd.mahalanobis(rr[-1 * nrAnalysis - 1:-1, :] - mcd.location_) ** (0.33)
aro = mcd.mahalanobis(rr[:-1 * nrAnalysis - 1, :] - mcd.location_) ** (0.33)
print(np.median(aro[mcd.support_]))
ax1.clear()
ax1.scatter(rr[:-1 * nrAnalysis - 1, [0]],
            rr[:-1 * nrAnalysis - 1, [3]], marker='+')
ax1.scatter(rr[-1 * nrAnalysis - 1:-1, [0]],
            rr[-1 * nrAnalysis - 1:-1, [3]],
def reduce_cnts_based_on_coeffs(coeffs: list, cnts: list,
                                percentile=75, plot=True) -> list:
    avgcoeffs = spatial_efd.AverageCoefficients(coeffs)
    SDcoeffs = spatial_efd.AverageSD(coeffs, avgcoeffs)
    if plot:
        median = np.median(np.array(coeffs), axis=0)
        x_med, y_med = spatial_efd.inverse_transform(median, harmonic=10)
        x_avg, y_avg = spatial_efd.inverse_transform(avgcoeffs, harmonic=10)
        x_sd, y_sd = spatial_efd.inverse_transform(SDcoeffs, harmonic=10)
        ax = spatial_efd.InitPlot()
        spatial_efd.PlotEllipse(ax, x_avg, y_avg, color="w", width=2.0)
        spatial_efd.PlotEllipse(ax, x_med, y_med, color="b", width=2.0)
        # Plot avg +/- 1 SD error ellipses
        spatial_efd.PlotEllipse(ax, x_avg + x_sd, y_avg + y_sd, color="r", width=1.0)
        spatial_efd.PlotEllipse(ax, x_avg - x_sd, y_avg - y_sd, color="r", width=1.0)
        plt.close("all")

    arr = np.array(coeffs)
    reshaped = np.reshape(arr, [arr.shape[0], -1])
    MCD = MinCovDet()
    MCD.fit(reshaped)
    a = MCD.mahalanobis(reshaped)
    if plot:
        plt.boxplot(a)
        plt.show()
        plt.close("all")

    percentile = np.percentile(a, percentile)
    reduced = list(np.array(coeffs)[a < percentile])
    avgcoeffs = spatial_efd.AverageCoefficients(reduced)
    SDcoeffs = spatial_efd.AverageSD(reduced, avgcoeffs)
    median = np.median(np.array(reduced), axis=0)
    x_med, y_med = spatial_efd.inverse_transform(median, harmonic=10)
    x_avg, y_avg = spatial_efd.inverse_transform(avgcoeffs, harmonic=10)
    x_sd, y_sd = spatial_efd.inverse_transform(0.1 * SDcoeffs, harmonic=10)
    if plot:
        ax = spatial_efd.InitPlot()
        spatial_efd.PlotEllipse(ax, x_avg, y_avg, color="w", width=2.0)
        spatial_efd.PlotEllipse(ax, x_med, y_med, color="b", width=2.0)
        # Plot avg +/- 1 SD error ellipses
        spatial_efd.PlotEllipse(ax, x_avg + x_sd, y_avg + y_sd, color="r", width=1.0)
        spatial_efd.PlotEllipse(ax, x_avg - x_sd, y_avg - y_sd, color="r", width=1.0)

        i = 10
        plt.figure()
        ax = plt.gca()
        spatial_efd.plotComparison(
            ax, coeffs[i], 10, cnts[i][:, 0], cnts[i][:, 1],
            color1="w", rotation=rots[i],
        )
        plt.show()
        plt.close("all")

    reduced_cnts = np.array(cnts)[a < percentile]
    return reduced_cnts, a < percentile
raw_slopes_ok_subjs = slopes_df[ok_subjects]

control_data = raw_slopes_ok_subjs[raw_slopes_ok_subjs['group'] == 'control']
control_slopes = control_data[task_names]

preHD_data = raw_slopes_ok_subjs[raw_slopes_ok_subjs['group'] == 'preHD']
preHD_slopes = preHD_data[task_names]

''' PCA Representation of raw slopes '''
all_slopes = raw_slopes_ok_subjs[task_names]
rs = RobustScaler()
scaled_all_slopes = (1.34896) * rs.fit_transform(all_slopes)

mcd = MinCovDet()  # random_state=1982
mcd.fit(scaled_all_slopes)

all_slopes_corr = corr_from_cov(mcd.covariance_)
plot_corr_matrix(all_slopes_corr, col_names=task_names, title='All corr, raw')

all_pcs, all_var_explained = PCs_from_cov(mcd.covariance_, task_names,
                                          n_components=n_components,
                                          convert_2_corr=True)

# Properly centered DataFrame of scaled slopes
stds = pd.Series(dict(zip(task_names, np.sqrt(mcd.covariance_.diagonal()))))
rn_slopes = pd.DataFrame(dict(zip(task_names, scaled_all_slopes.T))).\
    set_index(all_slopes.index) / stds -\
def DetectOutliers(sc, cluster_label, red_dim=2, outlier_prob_thres=10**-4):
    """
    This function implements the outlier detection scheme of FEATS.

    Parameters
    ----------
    sc : SingleCell
        The SingleCell object which contains the data and metadata of genes
        and cells.
    cluster_label : str
        The name of the column in the celldata assay of sc which stores the
        cluster labels of the cells.
    red_dim : int, optional
        The reduced dimensionality in which the outliers are computed.
        Default 2.
    outlier_prob_thres : float
        The probability threshold for samples to be classified as outliers.
        Default 10^-4.

    Returns
    -------
    SingleCell
        The single cell object containing the outlier analysis information
        in the celldata assay. It contains the following columns in the
        celldata assay with the outlier information:
        'FEATS_Outliers' - A column with the value True if the respective
        cell is an outlier, False otherwise.
        'FEATS_Msd' - The computed Mahalanobis squared distance for the
        respective cells.
        'FEATS_Outlier_Score' - The outlier score for the respective cells.
        'FEATS_Oos' - A column with the value True if the respective cell
        was not used by the Minimum Covariance Determinant (MCD) algorithm
        in computing the robust mean and covariance matrix.
    """

    # Store outlier probability in sc object
    sc.addCellData(col_data=-np.log10(np.ones(sc.dim[1]) * outlier_prob_thres),
                   col_name='Outlier_Thres')

    # First check if clustering has been performed
    if sc.checkCellData(cluster_label) == False:
        raise ValueError("Clustering has not been done. Perform clustering first!")
    else:
        print("Computing outliers . . .")
        # Get cluster labels
        labels = sc.getCellData(cluster_label)
        n_clusters = np.unique(labels)
        X = sc.getCounts()
        _, n_samples = X.shape

        # Sort according to F scores
        scores = sc.getGeneData('FEATS_F_Score')
        idx = np.argsort(scores, kind='mergesort')
        idx = idx[::-1]  # Sort descending
        # X = X[idx[0:100], :]

        # PCA
        pc = PCA(n_components=red_dim)
        X_red = pc.fit_transform(X.T)
        X_red = X_red.T

        # mcd object, to compute the minimum covariance determinant
        mcd = MinCovDet(assume_centered=True)

        # Out of sample estimates (bool), True if sample is not included
        # in the MCD computation
        oos = np.zeros(n_samples, dtype=bool)
        squared_md = np.zeros(n_samples)  # Squared Mahalanobis Distance

        # For each cluster, reduce the data and estimate the robust covariance
        for i in n_clusters:
            mask = (labels == i)
            # If the number of samples is less than the number of features
            # in the reduced data squared
            if np.sum(mask) < red_dim ** 2:
                print("Number of samples is less than number of features squared.")
                print("Not performing outlier detection on cluster ", i)
                oos[mask] = False       # Set the samples as not an outlier
                squared_md[mask] = 0.0  # Set the Mahalanobis distance to zero
            else:
                cluster = X_red[:, mask]
                mcd.fit(cluster.T)  # Fit a minimum covariance determinant estimator
                squared_md[mask] = mcd.mahalanobis(cluster.T)
                oos[mask] = (mcd.support_ == False)

        outlier_score = -np.log10(chi2.sf(squared_md, red_dim))
        outliers = outlier_score > -np.log10(outlier_prob_thres)

        print("Number of outliers = ", np.sum(outliers))
        print("Number of points in out of sample = ", np.sum(oos))
        print("Saving outlier information in Single Cell object . . .")
        sc.addCellData(col_data=outliers, col_name="FEATS_Outliers")
        sc.addCellData(col_data=squared_md, col_name="FEATS_Msd")
        sc.addCellData(col_data=outlier_score, col_name="FEATS_Outlier_Score")
        sc.addCellData(col_data=oos, col_name="FEATS_Oos")

    return sc