def _inverse_or_truncated_pinv(matrix):
    """Invert `matrix`, falling back to a truncated SVD pseudo-inverse.

    The fallback keeps the first singular value plus every following one
    whose cumulative sum stays below 95% of the total; the remaining
    components are zeroed out.
    """
    try:
        return linalg.inv(matrix)
    except (linalg.LinAlgError, ValueError):
        u, s, vh = linalg.svd(matrix)
        keep = np.cumsum(s) < np.sum(s) * .95
        keep[:1] = True  # always keep the leading component
        # Only invert the kept singular values: computing (1. / s) * keep
        # would turn zero singular values into inf * 0 = NaN.
        inv_s = np.zeros_like(s)
        inv_s[keep] = 1. / s[keep]
        return np.dot(np.dot(vh.T, np.diag(inv_s)), u.T)


def _distance_density_fit(data, location, covariance, p):
    """Compare the Mahalanobis-distance density with a fitted chi-2 law.

    Parameters
    ----------
    data : 2D array, one observation per row.
    location : location estimate subtracted from every row of `data`.
    covariance : scatter estimate whose inverse defines the distances.
    p : number of degrees of freedom of the chi-2 template.

    Returns
    -------
    x : grid on which both densities are evaluated.
    fh : Gaussian kernel density estimate of the distances on `x`.
    template_pdf : chi-2 density on `x`, shifted and scaled so that its
        mean and variance match those of the distances.
    mse : mean squared difference between `fh` and `template_pdf`.
    imse : 0.5 * sum of squared differences times the grid step.
    """
    inv_sigma = _inverse_or_truncated_pinv(covariance)

    # get distribution of data's Mahalanobis distances
    Y = data - location
    R = np.sqrt((np.dot(Y, inv_sigma) * Y).sum(1))
    R = R[~np.isnan(R)]

    # estimate the density with a gaussian kernel (1000-step grid up to
    # 1.2 times the largest distance)
    r_max = np.amax(R)
    x = np.arange(0., 1.2 * r_max, 0.0012 * r_max)
    n = R.size
    sigma = 1.05 * np.std(R) * n ** (-0.2)
    kernel_arg = (x[:, np.newaxis] - R) / sigma
    fh = ((1 / np.sqrt(2 * np.pi))
          * np.exp(-0.5 * kernel_arg ** 2)).sum(1) / (n * sigma)

    # chi-2 template matched to the mean and variance of the distances
    diff_scale = np.sqrt(R.var() / float(chi2.stats(p, moments='v')))
    diff_loc = R.mean() - float(chi2.stats(p, scale=diff_scale, moments='m'))
    template_pdf = chi2(p, loc=diff_loc, scale=diff_scale).pdf(x)

    squared_error = (fh - template_pdf) ** 2
    mse = squared_error.mean()
    imse = 0.5 * squared_error.sum() * (x[1] - x[0])
    return x, fh, template_pdf, mse, imse


def test_fast_mcd(data):
    """Compare naive and fast-MCD estimates through a chi-2 fit.

    For both the empirical mean/covariance and the estimates returned by
    `fast_mcd`, the Mahalanobis distances of the rows of `data` are
    computed, their density is estimated with a Gaussian kernel, and that
    density is compared with a chi-2 template. When the module-level flag
    `PLOT` is true, the curves are plotted and the errors printed.

    Parameters
    ----------
    data : 2D array, one observation per row (`data.shape[1]` is used as
        the number of degrees of freedom of the chi-2 template).

    Returns
    -------
    mse_naive, mse_robust, imse_naive, imse_robust : the mean squared and
        integrated squared errors of the chi-2 fit for each estimate.
    """
    p = data.shape[1]

    ### Naive location and scatter estimates
    # The original inverted `robust_covariance` here, before it was
    # defined; the bare `except` hid the error and always took the SVD
    # fallback. The naive covariance is what must be inverted.
    x1, fh, template_pdf, mse_naive, imse_naive = _distance_density_fit(
        data, data.mean(0), np.cov(data.T), p)
    if PLOT:
        plt.figure()
        plt.plot(x1, fh, color='blue')
        plt.plot(x1, template_pdf, linestyle='--', color='blue')
        # single-argument form prints the same text on Python 2 and 3
        print("MSE (naive case) = %s" % mse_naive)
        print("IMSE (naive case) = %s" % imse_naive)

    ### Robust location and scatter estimates
    robust_location, robust_covariance = fast_mcd(data)
    x2, fh, template_pdf, mse_robust, imse_robust = _distance_density_fit(
        data, robust_location, robust_covariance, p)
    if PLOT:
        plt.plot(x2, fh, color='green')
        plt.plot(x2, template_pdf, linestyle='--', color='green')
        print("MSE (robust case) = %s" % mse_robust)
        print("IMSE (robust case) = %s" % imse_robust)
        plt.legend(('empirical distribution (naive)', 'chi-2 (naive)',
                    'empirical distribution (robust)', 'chi-2 (robust)'),
                   loc='upper center', bbox_to_anchor=(0.5, 0.))
        plt.show()

    return mse_naive, mse_robust, imse_naive, imse_robust
# Fill row i of allsummary with, for each array in `values`, the mean of its
# non-NaN entries.
# NOTE(review): `i`, `values` and `img` are bound before this excerpt; these
# two statements presumably belong to a loop body -- confirm the indentation
# against the full file.
allsummary[i,:] = np.asarray([v[~np.isnan(v)].mean() for v in values])
del img

# deal with the NaNs, since they confuse argsort: keep only the rows of
# allsummary that contain no NaN
nonan_mask = np.isnan(allsummary).sum(1) == 0
fallsummary = allsummary[nonan_mask,:]

# --------> /!\ fixme: find a better way to trim
# per ROI trimmed-list of subjects (10 each side): sort the row indices
# column by column, drop the 10 first and 10 last of each column, then keep
# the row indices that survive in every column
M = np.argsort(fallsummary, 0)[10:-10]
trimmed_ind = reduce(np.intersect1d, M.T)
trimmed_allsummary = fallsummary[trimmed_ind]
del M
# (disabled alternative, no trimming) trimmed_allsummary = fallsummary

# SVD decomposition of the covariance matrix
# NOTE(review): `covariance` (from the trimmed rows) is not used in the
# visible remainder; the SVD below is applied to the fast_mcd estimate
# computed on all NaN-free rows.
covariance = np.cov(trimmed_allsummary.T)
robust_location, robust_covariance = fast_mcd(fallsummary)
u, s, vh = linalg.svd(robust_covariance)

# --------> /!\ fixme: look at that criterion (the original note said 75%,
# but the code uses .95)
# keep the first singular value plus those whose cumulative sum stays below
# 95% of the total; the inverse of the others is multiplied by False (0)
inv_s = (1. / s) * \
    ((np.cumsum(s) < np.sum(s) * .95) | ([True]+[False]*(len(s)-1)))
inv_sigma = np.dot(np.dot(vh.T, np.diag(inv_s)), u.T)

# --------> /!\ fixme: median ?
# compute Mahalanobis distances of every NaN-free row to the robust location
Y = fallsummary - robust_location
# (disabled alternative, centre on the mean) Y = fallsummary - np.mean(fallsummary, 0)
R = np.sqrt((np.dot(Y, inv_sigma) * Y).sum(1))

# find outliers threshold: work on an ascending-sorted copy of the non-NaN
# distances
sortedR = R[~np.isnan(R)].copy()
sortedR.sort()