def test_EmpiricalCovariance_validates_mahalanobis():
    """Checks that EmpiricalCovariance validates data with mahalanobis."""
    cov = EmpiricalCovariance().fit(X)

    msg = f"X has 2 features, but \\w+ is expecting {X.shape[1]} features as input"
    with pytest.raises(ValueError, match=msg):
        cov.mahalanobis(X[:, :2])
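For context, here is a minimal standalone sketch of the feature-count validation this test exercises. The data, shapes, and seed are illustrative assumptions; the error message shown is the one raised by recent scikit-learn versions.

import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 5)                # five fitted features (illustrative)
cov_demo = EmpiricalCovariance().fit(X_demo)

try:
    cov_demo.mahalanobis(X_demo[:, :2])   # pass only two of the five features
except ValueError as exc:
    print(exc)  # e.g. "X has 2 features, but EmpiricalCovariance is expecting 5 features as input."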
def plot_contours(self, ax, show=False):
    COV = self.emp_cov
    COV_slice = EmpiricalCovariance()
    COV_slice.location_ = np.array([COV.location_[0], COV.location_[1]])
    COV_slice.covariance_ = np.array([
        COV.covariance_[0, 0], COV.covariance_[0, 1],
        COV.covariance_[1, 0], COV.covariance_[1, 1],
    ]).reshape((2, 2))
    COV_slice.precision_ = np.array([
        COV.precision_[0, 0], COV.precision_[0, 1],
        COV.precision_[1, 0], COV.precision_[1, 1],
    ]).reshape((2, 2))

    # Show contours of the distance functions over +/- 5 standard deviations
    xx, yy = np.meshgrid(
        np.linspace(COV_slice.location_[0] - 5 * math.sqrt(COV_slice.covariance_[0, 0]),
                    COV_slice.location_[0] + 5 * math.sqrt(COV_slice.covariance_[0, 0]),
                    100),
        np.linspace(COV_slice.location_[1] - 5 * math.sqrt(COV_slice.covariance_[1, 1]),
                    COV_slice.location_[1] + 5 * math.sqrt(COV_slice.covariance_[1, 1]),
                    100),
    )
    zz = np.c_[xx.ravel(), yy.ravel()]

    # Empirical fit is not so good; don't plot it.
    if False:  # keep for debugging
        mahal_emp_cov = COV_slice.mahalanobis(zz).reshape(xx.shape)
        emp_cov_contour = ax.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                     levels=[1., 2., 3., 4., 5.],
                                     # cmap=plt.cm.PuBu_r,
                                     cmap=plt.cm.cool_r,
                                     linestyles='dashed')

    COV = self.rob_cov
    COV_slice = EmpiricalCovariance()
    COV_slice.location_ = np.array([COV.location_[0], COV.location_[1]])
    COV_slice.covariance_ = np.array([
        COV.covariance_[0, 0], COV.covariance_[0, 1],
        COV.covariance_[1, 0], COV.covariance_[1, 1],
    ]).reshape((2, 2))
    COV_slice.precision_ = np.array([
        COV.precision_[0, 0], COV.precision_[0, 1],
        COV.precision_[1, 0], COV.precision_[1, 1],
    ]).reshape((2, 2))
    self.robust_model_XY = COV_slice

    # The robust fit is better.
    if show:
        mahal_robust_cov = COV_slice.mahalanobis(zz).reshape(xx.shape)
        robust_contour = ax.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                    levels=[1., 2., 3., 4., 5.],
                                    # cmap=plt.cm.YlOrBr_r,
                                    cmap=plt.cm.spring_r,
                                    linestyles='dotted')
def test_covariance():
    """Tests Covariance module on a simple dataset."""
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), squared=False), 0)

    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert np.amax(mahal_dist) < 250
    assert np.amin(mahal_dist) > 50

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)
class OneClassMahalanobis(BaseClassifier):
    _fit_params = ['perc_keep']
    _predict_params = []

    def __init__(self, *args, **kwargs):
        # BaseClassifier.__init__(self, *args, **kwargs)
        self.perc_keep = kwargs["perc_keep"]

    def fit(self, data):
        nu = 0.01
        n_sample = data.shape[0]
        n_feature = data.shape[1]

        # Trim the nu/2 most extreme samples in each feature before fitting.
        exclude = set()
        for d in range(n_feature):
            feature = data[:, d]
            s_feature = numpy.sort(feature)
            low = s_feature[int(n_sample * nu / 2)]
            upp = s_feature[n_sample - int(n_sample * nu / 2)]
            exld = numpy.nonzero(numpy.logical_or(feature > upp,
                                                  feature < low))[0]
            exclude.update(exld)

        use = numpy.array([f for f in range(n_sample) if f not in exclude])
        data_ = data[use, :]

        self.cov = EmpiricalCovariance().fit(data_)
        dist = self.cov.mahalanobis(data)
        self.cutoff = numpy.percentile(dist, self.perc_keep)
        print(self.cutoff)

    def predict(self, data):
        mahal_dist = self.cov.mahalanobis(data)
        self.mahal_dist = mahal_dist
        print(mahal_dist.min(), mahal_dist.max(), self.cutoff,
              (mahal_dist > self.cutoff).sum(), "of", len(mahal_dist))
        # Map outliers (distance above cutoff) to -1 and inliers to +1.
        return (mahal_dist > self.cutoff).astype(numpy.uint8) * -2 + 1

    def decision_function(self, data=None):
        return self.mahal_dist
def detectOutlier(X):
    X = np.transpose(X)
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    # robust_cov = MinCovDet().fit(X)
    robust_cov = EmpiricalCovariance().fit(X)
    outlierVec = robust_cov.mahalanobis(X)
    return np.sqrt(outlierVec)
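Note that EmpiricalCovariance.mahalanobis returns squared distances, which is why the snippet above takes np.sqrt at the end. A quick sketch on synthetic data (seed and shapes are arbitrary) checking this against the explicit quadratic form (x - mu)^T Sigma^{-1} (x - mu):

import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(42)
X_demo = rng.randn(200, 3)
cov = EmpiricalCovariance().fit(X_demo)

# Manual squared Mahalanobis distance: (x - mu)^T Sigma^{-1} (x - mu)
centered = X_demo - cov.location_
manual_sq = np.einsum('ij,jk,ik->i', centered, cov.precision_, centered)

assert np.allclose(cov.mahalanobis(X_demo), manual_sq)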
def mahalanobisDistances(dm):
    reduced_data = PCA(n_components=2).fit_transform(dm)
    robust_cov = MinCovDet().fit(reduced_data)
    emp_cov = EmpiricalCovariance().fit(reduced_data)

    fig = plt.figure()
    plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

    subfig1 = plt.subplot(3, 1, 1)
    inlier_plot = subfig1.scatter(reduced_data[:, 0], reduced_data[:, 1],
                                  color='black', label='inliers')
    subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
    subfig1.set_title("Mahalanobis distances of a contaminated data set:")

    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
                         np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
    emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                      cmap=plt.cm.PuBu_r, linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
    robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                     cmap=plt.cm.YlOrBr_r, linestyles='dotted')
    plt.xticks(())
    plt.yticks(())

    # Plot the scores for each point (cube root keeps the scale readable)
    emp_mahal = emp_cov.mahalanobis(reduced_data - np.mean(reduced_data, 0)) ** (0.33)
    subfig2 = plt.subplot(2, 2, 3)
    plt.yticks(())

    robust_mahal = robust_cov.mahalanobis(reduced_data - robust_cov.location_) ** (0.33)
    subfig3 = plt.subplot(2, 2, 4)
    plt.yticks(())

    plt.show()
def mahal_plot(e):
    first_half = e[1:len(e) - 1]
    second_half = e[2:len(e)]
    X = np.array([first_half, second_half])
    X = np.transpose(X)

    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(X)
    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)

    fig = plt.figure()
    # Show data set
    subfig1 = plt.subplot(1, 1, 1)
    inlier_plot = subfig1.scatter(first_half, second_half, color='black',
                                  label='daily diff in homes passed')
    subfig1.set_title("Mahalanobis distances of the iid invariants:")

    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 800),
                         np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
    emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                      cmap=plt.cm.PuBu_r, linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
    # contour() takes the plural kwargs colors/linewidths; the original passed
    # color='red' and linewidth="3", which conflict with cmap and which newer
    # matplotlib rejects.
    robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                     cmap=plt.cm.YlOrBr_r, linewidths=3)

    subfig1.legend([emp_cov_contour.collections[1],
                    robust_contour.collections[1],
                    inlier_plot],
                   ['MLE dist', 'robust dist', 'kpis'],
                   loc="upper right", borderaxespad=0)

    print(np.corrcoef(first_half, second_half))
    return (robust_cov, emp_cov)
class Mahalanobis(BaseEstimator):
    """Mahalanobis distance estimator. Uses a covariance estimate to compute
    the Mahalanobis distance of the observations from the model.

    Parameters
    ----------
    robust : bool
        Whether to use a robust estimator based on the Minimum Covariance
        Determinant computation.
    """
    def __init__(self, robust=False):
        if not robust:
            from sklearn.covariance import EmpiricalCovariance as CovarianceEstimator
        else:
            from sklearn.covariance import MinCovDet as CovarianceEstimator
        self.model = CovarianceEstimator()
        self.cov = None

    def fit(self, X, y=None, **params):
        """Fits the covariance model according to the given training data and
        parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        self : object
            Returns self.
        """
        self.cov = self.model.fit(X)
        return self

    def score(self, X, y=None):
        """Computes the Mahalanobis distances of given observations.

        The provided observations are assumed to be centered. One may want to
        center them using a location estimate first.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The observations whose Mahalanobis distances we compute.

        Returns
        -------
        mahalanobis_distance : array, shape = [n_observations,]
            Mahalanobis distances of the observations.
        """
        # return self.model.score(X, assume_centered=True)
        # Negated cube root, so that higher scores mean closer to the model.
        return -self.model.mahalanobis(X - self.model.location_) ** 0.33
def test_covariance():
    """Tests Covariance module on a simple dataset."""
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError, cov.error_norm, emp_cov, norm='foo')

    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    # FIXME I don't know what this test does
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    assert_warns(UserWarning, cov.fit, X_1sample)
    assert_array_almost_equal(cov.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
def get_dose_dist_from_control(self, control, metric='euclidean'):
    mean_control = np.mean(control, axis=0).reshape(1, -1)
    mean_dose = self.get_mean_doses()
    if 'eucl' in metric:
        from sklearn.metrics import euclidean_distances
        dist = euclidean_distances(mean_control, mean_dose)
    elif 'mahalan' in metric:
        from sklearn.covariance import EmpiricalCovariance
        cov = EmpiricalCovariance().fit(control)
        dist = cov.mahalanobis(mean_dose)
    return dist
def mahalanobis_plot(ctry=None, df=None, weighted=True, inliers=False):
    """
    See http://scikit-learn.org/0.13/modules/outlier_detection.html#\
fitting-an-elliptic-envelop for details.
    """
    if df is None and ctry is None:
        raise ValueError('Either the country or a dataframe must be supplied')
    elif df is None:
        df = load_res(ctry, weighted=weighted)
    if inliers:
        df = get_inliers(df=df)
    X = df.values
    robust_cov = MinCovDet().fit(X)
    #--------------------------------------------------------------------------
    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)
    #--------------------------------------------------------------------------
    # Display results
    fig = plt.figure()
    fig.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
    #--------------------------------------------------------------------------
    # Show data set
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(X[:, 0], X[:, 1], alpha=.5, color='k', marker='.')
    ax1.set_title(country_code[ctry])
    #--------------------------------------------------------------------------
    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(ax1.get_xlim()[0], ax1.get_xlim()[1], 100),
                         np.linspace(ax1.get_ylim()[0], ax1.get_ylim()[1], 100))
    zz = np.c_[xx.ravel(), yy.ravel()]
    #--------------------------------------------------------------------------
    mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
    emp_cov_contour = ax1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r, linestyles='dashed')
    #--------------------------------------------------------------------------
    mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
    robust_contour = ax1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')
    ax1.legend([emp_cov_contour.collections[1], robust_contour.collections[1]],
               ['MLE dist', 'robust dist'],
               loc="upper right", borderaxespad=0)
    ax1.grid()
    return (fig, ax1, ctry)
def test_covariance():
    """Tests Covariance module on a simple dataset."""
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError, cov.error_norm, emp_cov, norm='foo')

    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    with warnings.catch_warnings(record=True):
        cov.fit(X_1sample)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_equal(cov.location_, np.zeros(X.shape[1]))
def outlier_rejection(feat, prob):
    '''Keep samples whose squared Mahalanobis distance from the feature
    median falls below the chi-squared quantile at probability `prob`.'''
    from sklearn.covariance import EmpiricalCovariance  # MinCovDet is a robust alternative
    robust_cov = EmpiricalCovariance().fit(feat)
    dist = robust_cov.mahalanobis(feat - numpy.median(feat, 0))
    cut = scipy.stats.chi2.ppf(prob, feat.shape[1])
    return dist < cut
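A hypothetical usage sketch for outlier_rejection above, on synthetic data; the injected offset and the 0.975 probability are arbitrary choices for illustration.

import numpy
import scipy.stats

rng = numpy.random.RandomState(0)
feat = rng.randn(500, 4)
feat[:10] += 6.0  # inject a few gross outliers

keep = outlier_rejection(feat, prob=0.975)
print(keep.sum(), "of", len(keep), "samples kept")  # the shifted rows should be rejected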
def test_covariance():
    # Tests Covariance module on a simple dataset.

    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    with pytest.raises(NotImplementedError):
        cov.error_norm(emp_cov, norm='foo')

    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    # Create X with 1 sample and 5 features
    X_1sample = np.arange(5).reshape(1, 5)
    cov = EmpiricalCovariance()
    warn_msg = (
        "Only one sample available. You may want to reshape your data array")
    with pytest.warns(UserWarning, match=warn_msg):
        cov.fit(X_1sample)
    assert_array_almost_equal(cov.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
def main():
    print("Running CV on Mahalanobis Distance based approach.")
    mahanalobis()

    start_time = time.time()
    totalX = []
    totalY = []
    flag = True
    countTrain = 0
    print("\n\nNow testing on separate data.")
    # Text mode for csv.reader under Python 3 (the original opened with "rb");
    # the original also initialised countTrain to 228000, which made the skip
    # condition below trivially true.
    with open("creditcard.csv", "r") as f:
        data = csv.reader(f)
        for row in data:
            if flag:  # skip the header row
                flag = False
                continue
            countTrain += 1
            if countTrain > 228000:  # hold-out test on the last ~20% of data
                totalX.append([float(i) for i in row[:-1]])
                totalY.append(int(row[-1]))
    print("Data Loaded")

    totalX = scalar.fit_transform(totalX)
    clf = EmpiricalCovariance()
    clf.fit(totalX)
    distances = clf.mahalanobis(totalX)

    Y = []
    for i in range(len(totalY)):
        if np.log10(distances[i]) > 1.838:
            Y.append(1)
        else:
            Y.append(0)

    print("%s seconds" % (time.time() - start_time))
    print("Results")
    auc = roc_auc_score(totalY, Y)
    print("Area under curve : " + str(auc))
    fpr, tpr, _ = roc_curve(totalY, Y)
    print("False Positive Rate : " + str(fpr[1]))
    _, recall, _ = precision_recall_curve(totalY, Y)
    print("Recall : " + str(recall[1]))

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, color='darkorange',
             label='ROC curve (area = %0.3f)' % auc)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc="lower right")
    plt.show()
class ChangeDetector(object):
    """
    Joint Gaussian change detector using a scikit-learn style interface.

    This class is really a wrapper around the methods in scikit-learn for
    estimating covariance using robust or empirical methods and calculating
    the Mahalanobis distances.
    """
    def __init__(self, method='robust', estimator_kw_args={}):
        # Use equality, not identity: the original `method is 'robust'` only
        # works by accident of string interning.
        if method == 'robust':
            self.covariance_estimator_ = MinCovDet(**estimator_kw_args)
        elif method == 'empirical':
            self.covariance_estimator_ = EmpiricalCovariance(
                **estimator_kw_args)
        else:
            raise ValueError(
                "{} is not a valid method. Must be one of 'robust' or 'empirical'"
                .format(method))

    def fit(self, X):
        """
        Fits the estimator.

        Parameters:
        -----------
        X - array of time series, shape (n_series, len_series)
        """
        self.covariance_estimator_ = self.covariance_estimator_.fit(X)
        return self

    def predict(self, X, threshold):
        """
        Returns true for each time series predicted as change.
        Also returns the Mahalanobis distances.

        Parameters:
        -----------
        X - array of time series, shape (n_series, len_series)
        threshold - float

        Returns:
        --------
        y_pred - shape (n_time_series,), true if change detected
        distances - shape (n_time_series,). The Mahalanobis distances of each
            time series under the fitted distribution
        """
        distances = self.covariance_estimator_.mahalanobis(X)
        return distances > threshold, distances
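A small usage sketch for ChangeDetector; the synthetic series and the threshold value are assumptions for illustration, not from the original code.

import numpy as np

rng = np.random.RandomState(1)
baseline = rng.randn(200, 10)             # 200 series of length 10
detector = ChangeDetector(method='empirical').fit(baseline)

shifted = baseline[:5] + 4.0              # simulate changed series
y_pred, dists = detector.predict(np.vstack([baseline[:5], shifted]),
                                 threshold=50.0)
print(y_pred)  # the shifted half should exceed the threshold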
def test_covariance():
    """Tests Covariance module on a simple dataset."""
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm="spectral"), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm="frobenius"), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError, cov.error_norm, emp_cov, norm="foo")

    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm="spectral"), 0)

    # test with one sample
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    with warnings.catch_warnings(record=True):
        cov.fit(X_1sample)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
def deneAna(dene, plot=False, output="dene.png"):
    X = np.array([sim[2:] for sim in dene])
    Xmean = np.mean(X, axis=0)
    Xtocov = scale(X, axis=0, with_mean=True, with_std=False, copy=True)
    emp_cov = EmpiricalCovariance(assume_centered=True).fit(Xtocov)
    mahal_dist = emp_cov.mahalanobis(Xtocov)
    max_dist = max(mahal_dist)
    if plot:
        # 5.991465 is the 95% chi-squared quantile with 2 degrees of freedom.
        chi2range = [5.991465, max_dist]
        outliers = []
        for i in range(0, len(mahal_dist)):
            if mahal_dist[i] > chi2range[0]:
                outliers.append(i)
        # `Xscaled` was undefined in the original; the centered `Xtocov`
        # appears to be what was meant. `intnames`, `a` and `b` are assumed
        # to be defined elsewhere.
        plotdeneana(Xtocov, Xmean, emp_cov, chi2range, intnames, outliers,
                    output, a, b, lc="blue")
    return emp_cov, Xmean, max_dist
class OneClassMahalanobis(BaseClassifier):
    _fit_params = []

    def __init__(self, *args, **kwargs):
        pass

    def fit(self, data):
        # self.cov = MinCovDet().fit(data)
        self.cov = EmpiricalCovariance().fit(data)

    def predict(self, data):
        mahal_emp_cov = self.cov.mahalanobis(data)
        # Squared Mahalanobis distances of Gaussian data follow a chi-squared
        # distribution with n_features degrees of freedom.
        d = data.shape[1]
        thres = scipy.stats.chi2.ppf(0.95, d)
        self.mahal_emp_cov = mahal_emp_cov
        # Outliers map to -1, inliers to +1.
        return (mahal_emp_cov > thres).astype(numpy.int32) * -2 + 1

    def decision_function(self, data):
        return self.mahal_emp_cov
class MahalanobisEstimator:
    """
    Store location and dispersion estimators of the empirical distribution of
    data provided in an array and allow computation of statistical distances.

    Parameters
    ----------
    arr : {pandas.DataFrame, np.ndarray}
        the matrix used to calculate covariance

    Attributes
    ----------
    sigma : np.array
        Fitted covariance matrix of sklearn.covariance.EmpiricalCovariance()

    Methods
    -------
    mahalanobis(X)
        Computes the Mahalanobis distance between the fitted distribution of
        the input array (self.arr) and the X array as provided
    """
    def __init__(self, arr: Union[pd.DataFrame, np.ndarray]):
        self.sigma = EmpiricalCovariance().fit(arr)

    def mahalanobis(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """Compute the Mahalanobis distance between the empirical distribution
        described by this object and points in an array `X`.

        Parameters
        ----------
        X : {pandas.DataFrame, np.ndarray}
            A samples-by-features array-like matrix for which to compute the
            Mahalanobis distance from self.arr

        Returns
        -------
        numpy.array
            Mahalanobis distance between the input array and the original
            sigma
        """
        return self.sigma.mahalanobis(X)
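Usage of MahalanobisEstimator might look like the following (synthetic data); note the returned values are squared distances, measured against the location and covariance fitted on the reference array.

import numpy as np

rng = np.random.RandomState(7)
reference = rng.randn(300, 4)
est = MahalanobisEstimator(reference)

query = rng.randn(5, 4)
print(est.mahalanobis(query))  # squared distances, one per query row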
def fit(self, X, y=None):
    scaler_norm = Normalizer(norm=self.norm).fit(X)
    df_all_norm = scaler_norm.transform(X).astype(float)
    df_all_norm = pd.DataFrame(data=np.hstack((df_all_norm, y.reshape((-1, 1)))),
                               columns=self.colNames)
    X_SCALED = df_all_norm.iloc[:, :-1].values

    robust_cov_all = EmpiricalCovariance().fit(X_SCALED[:, :])
    robust_mahal_all = robust_cov_all.mahalanobis(
        X_SCALED[:, :] - robust_cov_all.location_) ** (0.33)

    # ALT:2 -- flag outliers via Tukey's fences on the transformed distances
    rm = pd.DataFrame(robust_mahal_all, columns=["value"])
    iqr = float(rm["value"].quantile(0.75)) - float(rm["value"].quantile(0.25))
    outlierRatioRob_all_1 = rm["value"].quantile(0.75) + (1.5 * iqr)
    outlierRatioRob_all_2 = rm["value"].quantile(0.25) - (1.5 * iqr)
    print(iqr, rm["value"].quantile(0.25), rm["value"].quantile(0.75))
    print("Outliers min ratio:", outlierRatioRob_all_1, outlierRatioRob_all_2)
    print("Num outliers detected:",
          len(X_SCALED[robust_mahal_all > outlierRatioRob_all_1, 0]))
    print("Num outliers detected:",
          len(X_SCALED[robust_mahal_all < outlierRatioRob_all_2, 0]))
    print([(self.codes[r], robust_mahal_all[r])
           for r in range(len(robust_mahal_all))])

    patients_out = self.codes[robust_mahal_all > outlierRatioRob_all_1]
    self.codesToDel.append(patients_out)
    print("Patients outliers above: {}".format(patients_out))

    patients_out = self.codes[robust_mahal_all < outlierRatioRob_all_2]
    self.codesToDel.append(patients_out)
    print("Patients outliers below: {}".format(patients_out))
    return self
class MahalanobisDistance(DimReducer):
    """
    Computes a person's Mahalanobis distance using the mean and covariance
    estimated from a set of young people.
    Uses sklearn; verified this matches up with the normal matrix computation.
    """
    def __init__(self, age_lower, age_upper):
        self.age_lower = age_lower
        self.age_upper = age_upper
        self.need_ages = True
        self.k = 1

    def _fit_from_processed_data(self, X, ages):
        young_people = (ages >= self.age_lower) & (ages <= self.age_upper)
        print("%i people between %s and %s used for mean/cov calculation" % (
            young_people.sum(), self.age_lower, self.age_upper))
        assert young_people.sum() > 1000
        self.model = EmpiricalCovariance(assume_centered=False)
        self.model.fit(X[young_people, :])

    def _get_projections_from_processed_data(self, X):
        md = np.sqrt(self.model.mahalanobis(X)).reshape([-1, 1])
        return md
def mahanalobis():
    totalX = []
    totalY = []
    flag = True
    countTrain = 0
    # Text mode for csv.reader under Python 3 (the original opened with "rb").
    with open("creditcard.csv", "r") as f:
        data = csv.reader(f)
        for row in data:
            if flag:  # skip the header row
                flag = False
                continue
            if countTrain >= 228000:  # CV on the first ~80% of the data
                break
            countTrain += 1
            totalX.append([float(i) for i in row[:-1]])
            totalY.append(int(row[-1]))
    totalX = scalar.fit_transform(totalX)
    print("Data Loaded")

    clf = EmpiricalCovariance()
    clf.fit(totalX)
    distances = clf.mahalanobis(totalX)

    Y = []
    for i in range(len(totalY)):
        if np.log10(distances[i]) > 1.838:
            Y.append(1)
        else:
            Y.append(0)

    print("Results")
    auc = roc_auc_score(totalY, Y)
    print(auc)
    fpr, _, _ = roc_curve(totalY, Y)
    print(fpr[1])
    _, recall, _ = precision_recall_curve(totalY, Y)
    print(recall[1])
    return auc, fpr[1], recall[1]
def main():
    parser = argparse.ArgumentParser(
        description='Plot outlier-like distances for a 2-dimensional dataset')
    parser.add_argument('dataset', type=argparse.FileType('r'),
                        help='a CSV file containing the dataset')
    parser.add_argument(
        '--plot', type=str, choices=['train', 'grid'], default='grid',
        help='plot the dataset or a grid evenly distributed over its span')
    parser.add_argument('--plotdims', type=int, choices=[2, 3], default=2,
                        help='the number of dimensions to plot')
    args = parser.parse_args()

    X = np.loadtxt(args.dataset, delimiter=',')
    fig = plt.figure()

    xformer = NullTransformer()
    if X.shape[1] > 2:
        xformer = PCA(n_components=2)
        X = xformer.fit_transform(X)

    if args.plotdims == 2:
        plt.scatter(X[:, 0], X[:, 1], s=60, linewidth=0)
    else:
        plt.scatter(X[:, 0], X[:, 1])
    plt.show(block=False)

    path_to_script = os.path.realpath(__file__)
    dir_of_script = os.path.dirname(path_to_script)
    dataset_path = dir_of_script + '/outliers.npy'
    np.save(dataset_path, X)

    ###########################################################################
    # Train autoencoder with the n samples until convergence. Run evenly
    # distributed samples through the autoencoder and compute their
    # reconstruction error.
    ###########################################################################

    maxseq_orig = np.max(X)
    minseq_orig = np.min(X)
    seqrange = np.abs(maxseq_orig - minseq_orig)
    maxseq = maxseq_orig + 0.5 * seqrange
    minseq = minseq_orig - 0.5 * seqrange
    print("minseq", minseq, "maxseq", maxseq)

    if args.plot == 'grid':
        seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
        Xplot = np.array([_ for _ in product(seq, seq)])
    else:
        Xplot = X

    robust_cov = MinCovDet().fit(X)
    robust_md = robust_cov.mahalanobis(Xplot)

    empirical_cov = EmpiricalCovariance().fit(X)
    empirical_md = empirical_cov.mahalanobis(Xplot)

    # Assume Xplot is at least 2-dimensional.
    if Xplot.shape[1] > 2:
        Xplot2d = bh_sne(Xplot)
    else:
        Xplot2d = Xplot

    robust_md01 = robust_md - np.nanmin(robust_md)
    robust_md01 = robust_md01 / np.nanmax(robust_md01)
    empirical_md01 = empirical_md - np.nanmin(empirical_md)
    empirical_md01 = empirical_md01 / np.nanmax(empirical_md01)

    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], cmap=plt.cm.jet,
                   c=robust_md01, s=60, linewidth=0)
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], robust_md01,
                        cmap=plt.cm.jet, color=robust_md01)
        ax.set_zlabel('Mahalanobis distance')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (robust covariance)')

    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], cmap=plt.cm.jet,
                   c=empirical_md01, s=60, linewidth=0)
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], empirical_md01,
                        cmap=plt.cm.jet, color=empirical_md01)
        ax.set_zlabel('Mahalanobis distance')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (empirical covariance)')

    enc_dec = [
        # tanh encoder, linear decoder
        ['tanh', 'linear'],
        # sigmoid encoder, linear decoder
        ['sigmoid', 'linear'],
        #######################################################################
        # The reconstruction error of the autoencoders trained with the
        # remaining commented-out pairs doesn't seem to match Mahalanobis
        # distance very well. Feel free to uncomment them to see for
        # yourself.
        # linear encoder, linear decoder
        # ['linear', 'linear'],
        # tanh encoder, tanh decoder
        # ['tanh', 'tanh'],
        # tanh encoder, sigmoid decoder
        # ['tanh', 'sigmoid'],
        # sigmoid encoder, tanh decoder
        # ['sigmoid', 'tanh'],
        # sigmoid encoder, sigmoid decoder
        # ['sigmoid', 'sigmoid']
        #######################################################################
    ]

    for i, act in enumerate(enc_dec):
        enc, dec = act
        if dec == 'linear':
            dec = None
        model = train_autoencoder(dataset_path, act_enc=enc, act_dec=dec,
                                  nvis=X.shape[1], nhid=16)

        Xshared = theano.shared(np.asarray(Xplot, dtype=theano.config.floatX),
                                borrow=True)
        f = theano.function([], outputs=model.reconstruct(Xshared))
        fit = f()
        error = reconstruction_error(Xplot, fit)
        error01 = error - np.nanmin(error)
        error01 = error01 / np.nanmax(error01)

        fig = plt.figure()
        if args.plotdims == 2:
            ax = fig.add_subplot(1, 1, 1)
            ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], cmap=plt.cm.jet,
                       c=error, s=60, linewidth=0)
        else:
            ax = fig.add_subplot(1, 1, 1, projection='3d')
            ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], error,
                            cmap=plt.cm.jet, color=error01)
            ax.set_zlabel('Reconstruction error')
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        encdec_type = ', '.join(act)
        ax.set_title('Reconstruction error (' + encdec_type + ')')

        print("Correlation of robust MD and reconstruction error (" +
              str(encdec_type) + ") " + str(pearsonr(robust_md, error)))
        print("Correlation of empirical MD and reconstruction error (" +
              str(encdec_type) + ") " + str(pearsonr(empirical_md, error)))

    print("Correlation of robust MD and empirical MD " +
          str(pearsonr(robust_md, empirical_md)))

    os.remove(dataset_path)
    os.remove('outliers.pkl')
    plt.show(block=True)
fig = plt.figure()

# Show data set
subfig1 = plt.subplot(1, 1, 1)
inlier_plot = subfig1.scatter(first_half, second_half, color='black',
                              label='daily diff in homes passed')
subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 800),
                     np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
zz = np.c_[xx.ravel(), yy.ravel()]

mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r, linestyles='dashed')

mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')

subfig1.legend([emp_cov_contour.collections[1],
                robust_contour.collections[1],
                inlier_plot],
               ['MLE dist', 'robust dist', 'kpis'],
               loc="upper right", borderaxespad=0)

print(np.corrcoef(first_half, second_half))
import matplotlib.cm as cm

# Import data
data = pd.read_excel('C:/Users/dorta/Dropbox/Stanford/GS 240/Homeworks/Hmk4//ilr_data.xls')
ilr_cols = ['ilr' + str(x) for x in range(1, 30)]
data_ilr = data.loc[:, ilr_cols]

# -------------------------- Outlier Detection --------------------------
# Fit the covariances
robust_cov = MinCovDet().fit(data_ilr)
emp_cov = EmpiricalCovariance().fit(data_ilr)

# Get the Mahalanobis distances
robust_dist = np.sqrt(robust_cov.mahalanobis(data_ilr))
classic_dist = np.sqrt(emp_cov.mahalanobis(data_ilr))

# Chi-squared test at p=0.025
thresh = np.sqrt(chi2.isf(0.025, len(ilr_cols)))

# Plot of the outliers
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.scatter(classic_dist[robust_dist < thresh],
            robust_dist[robust_dist < thresh],
            s=7, c='c', marker="+", label='inliers')
ax1.scatter(classic_dist[robust_dist > thresh],
            robust_dist[robust_dist > thresh],
            s=7, c='r', marker="+", label='outliers')
x = np.linspace(*ax1.get_xlim())
ax1.plot(x, x, linewidth=1, linestyle='--', color='b')
ax1.plot([0, 20], [thresh, thresh], linewidth=0.5, linestyle='--', color='r')
ax1.plot([thresh, thresh], [0, 40], linewidth=0.5, linestyle='--', color='r')
plt.legend(loc='upper left')
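The cutoff above is applied on the distance (square-root) scale, so the chi-squared quantile is square-rooted as well. For the 29 ilr coordinates used here this gives roughly:

import numpy as np
from scipy.stats import chi2

# 97.5% chi-squared quantile with 29 degrees of freedom, on the distance scale
print(np.sqrt(chi2.isf(0.025, 29)))  # ~6.76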
# save for heuristic correction
age = df_test['var15']
age_ecdf = ECDF(df_train['var15'])
df_train['var15'] = age_ecdf(df_train['var15'])
df_test['var15'] = age_ecdf(df_test['var15'])

# feature engineering
df_train.loc[df_train['var3'] == -999999.000000, 'var3'] = 2.0
df_train['num_zeros'] = (df_train == 0).sum(axis=1)
# note: the original masked df_test with df_train's values; each frame
# should be masked with its own
df_test.loc[df_test['var3'] == -999999.000000, 'var3'] = 2.0
df_test['num_zeros'] = (df_test == 0).sum(axis=1)

# outliers: drop training rows with extreme squared Mahalanobis distance
ec = EmpiricalCovariance()
ec = ec.fit(df_train)
m2 = ec.mahalanobis(df_train)
df_train = df_train[m2 < 40000]
df_target = df_target[m2 < 40000]

# clip
# df_test = df_test.clip(df_train.min(), df_train.max(), axis=1)

# standard preprocessing
prep = Pipeline([
    ('cd', ColumnDropper(drop=ZERO_VARIANCE_COLUMNS + CORRELATED_COLUMNS)),
    ('std', StandardScaler())
])
X_train = prep.fit_transform(df_train)
X_test = prep.transform(df_test)
y_train = df_target.values
y[n_outliers:] = -1

# 2D plot
plt.scatter(X_01[:-n_outliers, 0], X_01[:-n_outliers, 1], c=colors[0])
plt.scatter(X_01[-n_outliers:, 0], X_01[-n_outliers:, 1], c=colors[0],
            marker='x')

cov_emp = EmpiricalCovariance().fit(X_01_in)
# str + ndarray raises TypeError; print the array as a separate argument
print('Covariance:', cov_emp.covariance_)

xx, yy = np.meshgrid(np.linspace(-7, 7, 150), np.linspace(-7, 7, 150))
Z = cov_emp.mahalanobis(
    np.c_[xx.ravel(), yy.ravel()]
) > chi2_interval_max  # make sure the degrees of freedom for chi2 are correct
Z = Z.reshape(xx.shape)
plt.contour(xx, yy, Z, levels=[0], linewidths=200, colors='black')

outlier_pred = cov_emp.mahalanobis(X_01) > chi2_interval_max
outlier_true = y == -1
plt.scatter(X_01[outlier_pred & outlier_true, 0],
            X_01[outlier_pred & outlier_true, 1],
            c=colors[1], marker='x', label='TP')
plt.scatter(X_01[~outlier_pred & outlier_true, 0],
            X_01[~outlier_pred & outlier_true, 1],
            c=colors[0],
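chi2_interval_max is not defined in this fragment. For 2-D Gaussian data the squared Mahalanobis distance follows a chi-squared distribution with 2 degrees of freedom, so a plausible definition (an assumption, consistent with the in-code note about degrees of freedom) would be:

from scipy.stats import chi2

# Assumed definition: 97.5% quantile of chi-squared with df = n_features = 2
chi2_interval_max = chi2.ppf(0.975, df=2)  # ~7.38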
# Show data set
subfig1 = pl.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions
xx, yy = np.meshgrid(np.linspace(pl.xlim()[0], pl.xlim()[1], 100),
                     np.linspace(pl.ylim()[0], pl.ylim()[1], 100))
zz = np.c_[xx.ravel(), yy.ravel()]

mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=pl.cm.PuBu_r, linestyles='dashed')

mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=pl.cm.YlOrBr_r, linestyles='dotted')

subfig1.legend([emp_cov_contour.collections[1],
                robust_contour.collections[1],
                inlier_plot, outlier_plot],
               ['MLE dist', 'robust dist', 'inliers', 'outliers'],
               loc="upper right", borderaxespad=0)
pl.xticks(())
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
                     np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
zz = np.c_[xx.ravel(), yy.ravel()]

mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r, linestyles='dashed')

mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')
discriminator.save_weights('models/dis_weights_%d.h5' % id_remove)
with open('models/dis_architecture_%d.json' % id_remove, 'w') as f:
    f.write(discriminator.to_json())

X_test_remaining = X_test.copy()
X_test = np.vstack([removed_routes, X_test_remaining])
X_test = np.reshape(X_test, (len(X_test), TRIP_SIZE, NUM_VALS, 1))
Y_test = np.hstack([np.ones(len(removed_routes)),
                    np.zeros(len(X_test_remaining))])

# Mahalanobis distance in the encoder's latent space
encodings_train = encoder.predict(X_train)
emp_cov = EmpiricalCovariance().fit(encodings_train)
encodings_test = encoder.predict(X_test)
emp_mahal = emp_cov.mahalanobis(encodings_test)

val_arr = np.asarray(emp_mahal)
val_probs = val_arr / max(val_arr)

roc_auc = roc_auc_score(Y_test, val_probs)
prauc = average_precision_score(Y_test, val_probs)
roc_auc_scores.append(roc_auc)
prauc_scores.append(prauc)
print("ROC AUC SCORE FOR %d: %f" % (id_remove, roc_auc))
print("PRAUC SCORE FOR %d: %f" % (id_remove, prauc))

np.savetxt('auc/roc_auc_scores.txt', roc_auc_scores, fmt='%5s', delimiter=",")
plt.scatter(np.arange(len(roc_auc_scores)), roc_auc_scores)
plt.savefig("auc/roc_auc_scores.png")
emp_cov = EmpiricalCovariance().fit(X)

# Display results
fig = pl.figure()

# Show data set
subfig1 = pl.subplot(3, 1, 1)
subfig1.scatter(X[:, 0], X[:, 1], color='black', label='inliers')
subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
subfig1.legend(loc="upper right")

# Cube-root transform keeps the boxplots readable
emp_mahal = emp_cov.mahalanobis(X) ** (0.33)
subfig2 = pl.subplot(2, 2, 3)
subfig2.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]],
                widths=.25)
subfig2.plot(1.26 * np.ones(n_samples - n_outliers),
             emp_mahal[:-n_outliers], '+k', markeredgewidth=1)
subfig2.plot(2.26 * np.ones(n_outliers),
             emp_mahal[-n_outliers:], '+k', markeredgewidth=1)
subfig2.axes.set_xticklabels(('inliers', 'outliers'), size=11)
subfig2.set_ylabel(r"$\sqrt[3]{\rm{(Mahal. dist.)}}$")
subfig2.set_title("1. from non-robust estimates\n(Maximum Likelihood)")

robust_mahal = robust_cov.mahalanobis(X) ** (0.33)
subfig3 = pl.subplot(2, 2, 4)
subfig3.boxplot([robust_mahal[:-n_outliers], robust_mahal[-n_outliers:]],
                widths=.25)
subfig3.plot(1.26 * np.ones(n_samples - n_outliers),
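Several snippets in this section raise the squared distances to the power 0.33 before plotting. That is a cube-root transform in the spirit of Wilson and Hilferty, which makes a chi-squared variable roughly normal and the boxplots easier to read. A quick illustration on synthetic draws (degrees of freedom and sample size are arbitrary):

import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
sq_dists = stats.chi2(df=2).rvs(size=10000, random_state=rng)

# The cube root is far less skewed than the raw squared distances.
print(stats.skew(sq_dists), stats.skew(sq_dists ** (1 / 3)))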
# Show data set
subfig1 = plt.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")

# Show contours of the distance functions
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
                     np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
zz = np.c_[xx.ravel(), yy.ravel()]

mahal_emp_cov = emp_cov.mahalanobis(zz).reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r, linestyles='dashed')

mahal_robust_cov = robust_cov.mahalanobis(zz).reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')

subfig1.legend([emp_cov_contour.collections[1],
                robust_contour.collections[1],
                inlier_plot, outlier_plot],
               ['MLE dist', 'robust dist', 'inliers', 'outliers'],
               loc="upper right", borderaxespad=0)
plt.xticks(())
offset_bottom = fig.subplotpars.bottom
width = fig.subplotpars.right - offset_left

subfig1 = pl.subplot(3, 1, 1)
subfig2 = pl.subplot(3, 1, 2)
subfig3 = pl.subplot(3, 1, 3)

# Show data set
subfig1.scatter(X[:, 0], X[:, 1], color='black', label='inliers')
subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
subfig1.legend(loc="upper right")

# Empirical covariance-based Mahalanobis distances
subfig2.scatter(np.arange(n_samples), emp_cov.mahalanobis(X),
                color='black', label='inliers')
subfig2.scatter(np.arange(n_samples)[-n_outliers:],
                emp_cov.mahalanobis(X)[-n_outliers:],
                color='red', label='outliers')
subfig2.set_ylabel("Mahal. dist.")
subfig2.set_title("1. from empirical estimates")
subfig2.axes.set_position(pos=[offset_left, 0.39, width, .2])

# MCD-based Mahalanobis distances
subfig3.scatter(np.arange(n_samples), robust_cov.mahalanobis(X),
                color='black', label='inliers')
subfig3.scatter(np.arange(n_samples)[-n_outliers:],
                robust_cov.mahalanobis(X)[-n_outliers:],
                color='red', label='outliers')
subfig3.set_ylabel("Mahal. dist.")