def test_maha():
    """Compare maha_dist_sq with squared Mahalanobis distances computed in R."""
    # The reference values were produced by the following R session:
    #
    # > x1 <- round(rnorm(10,3), 3)
    # > x2 <- round(x1 + rnorm(10), 3)
    # > x3 <- round(x2 + runif(10), 3)
    # > x1
    #  [1] 3.853 2.401 2.253 3.067 1.887 3.293 3.995 2.559 2.785 2.228
    # > x2
    #  [1] 4.294 1.915 1.315 4.641 1.611 2.838 3.696 1.337 2.853 2.434
    # > x3
    #  [1] 4.785 2.352 2.023 4.978 2.329 3.101 4.494 2.204 3.468 3.075
    # > obs <- cbind(x1, x2, x3)
    # > S <- var(obs)
    # > S
    #           x1        x2       x3
    # x1 0.5020374 0.6667232 0.633355
    # x2 0.6667232 1.4434718 1.326026
    # x3 0.6333550 1.3260262 1.248315
    # > mahalanobis(obs, c(mean(x1), mean(x2), mean(x3)), S)
    #  [1] 2.1838336 1.9673401 1.3335029 4.9191627 2.1246818 5.3297995 4.9022487
    #  [8] 2.5335913 0.1952562 1.5105832
    from scitbx.array_family import flex

    from dials.algorithms.statistics.fast_mcd import cov, maha_dist_sq

    from libtbx.test_utils import approx_equal

    # One row per variable (x1, x2, x3), copied from the R session above.
    raw_columns = (
        (3.853, 2.401, 2.253, 3.067, 1.887, 3.293, 3.995, 2.559, 2.785, 2.228),
        (4.294, 1.915, 1.315, 4.641, 1.611, 2.838, 3.696, 1.337, 2.853, 2.434),
        (4.785, 2.352, 2.023, 4.978, 2.329, 3.101, 4.494, 2.204, 3.468, 3.075),
    )
    columns = [flex.double(values) for values in raw_columns]

    # Centre on the per-column means and use the covariance of the columns,
    # mirroring the arguments given to R's mahalanobis().
    means = [flex.mean(column) for column in columns]
    covariance = cov(*columns)
    distances = maha_dist_sq(columns, means, covariance)

    expected = [
        2.1838336,
        1.9673401,
        1.3335029,
        4.9191627,
        2.1246818,
        5.3297995,
        4.9022487,
        2.5335913,
        0.1952562,
        1.5105832,
    ]
    assert approx_equal(list(distances), expected)
def test_maha():
    """Check maha_dist_sq against squared Mahalanobis distances from R.

    Prints "OK" when the comparison succeeds.
    """
    # Want implementation of Mahalanobis distance to match this R session:
    # > x1 <- round(rnorm(10,3), 3)
    # > x2 <- round(x1 + rnorm(10), 3)
    # > x3 <- round(x2 + runif(10), 3)
    # > x1
    #  [1] 3.853 2.401 2.253 3.067 1.887 3.293 3.995 2.559 2.785 2.228
    # > x2
    #  [1] 4.294 1.915 1.315 4.641 1.611 2.838 3.696 1.337 2.853 2.434
    # > x3
    #  [1] 4.785 2.352 2.023 4.978 2.329 3.101 4.494 2.204 3.468 3.075
    # > obs <- cbind(x1, x2, x3)
    # > S <- var(obs)
    # > S
    #           x1        x2       x3
    # x1 0.5020374 0.6667232 0.633355
    # x2 0.6667232 1.4434718 1.326026
    # x3 0.6333550 1.3260262 1.248315
    # > mahalanobis(obs, c(mean(x1), mean(x2), mean(x3)), S)
    #  [1] 2.1838336 1.9673401 1.3335029 4.9191627 2.1246818 5.3297995 4.9022487
    #  [8] 2.5335913 0.1952562 1.5105832
    from scitbx.array_family import flex
    from dials.algorithms.statistics.fast_mcd import maha_dist_sq, cov

    # test Mahalanobis distance.
    x1 = flex.double(
        (3.853, 2.401, 2.253, 3.067, 1.887, 3.293, 3.995, 2.559, 2.785, 2.228)
    )
    x2 = flex.double(
        (4.294, 1.915, 1.315, 4.641, 1.611, 2.838, 3.696, 1.337, 2.853, 2.434)
    )
    x3 = flex.double(
        (4.785, 2.352, 2.023, 4.978, 2.329, 3.101, 4.494, 2.204, 3.468, 3.075)
    )
    cols = [x1, x2, x3]
    center = [flex.mean(e) for e in cols]
    covmat = cov(x1, x2, x3)

    maha = maha_dist_sq(cols, center, covmat)

    from libtbx.test_utils import approx_equal

    R_result = [
        2.1838336,
        1.9673401,
        1.3335029,
        4.9191627,
        2.1246818,
        5.3297995,
        4.9022487,
        2.5335913,
        0.1952562,
        1.5105832,
    ]
    assert approx_equal(list(maha), R_result)

    # Function-call form: valid on Python 3 and prints the same text on Python 2.
    print("OK")
def _detect_outliers(self, cols):
    """Flag observations whose squared Mahalanobis distance exceeds the cutoff.

    A FastMCD estimator is built from ``cols`` using the tuning parameters
    stored on this object. Its corrected location and scatter estimates are
    used to compute the squared Mahalanobis distance of every observation.

    Args:
        cols: the data columns handed to both FastMCD and maha_dist_sq.

    Returns:
        The result of comparing the squared distances against
        ``self._mahasq_cutoff`` (true where the distance is larger).
    """
    mcd_settings = {
        "alpha": self._alpha,
        "max_n_groups": self._max_n_groups,
        "min_group_size": self._min_group_size,
        "n_trials": self._n_trials,
        "k1": self._k1,
        "k2": self._k2,
        "k3": self._k3,
    }
    estimator = FastMCD(cols, **mcd_settings)

    # Location and MCD scatter estimate
    location, scatter = estimator.get_corrected_T_and_S()

    # Squared Mahalanobis distances compared against the threshold
    return maha_dist_sq(cols, location, scatter) > self._mahasq_cutoff
def _detect_outliers(self, cols):
    """Identify outliers using squared Mahalanobis distances from a FastMCD fit.

    Args:
        cols: the data columns handed to both FastMCD and maha_dist_sq.

    Returns:
        The result of ``d2s > self._mahasq_cutoff``, i.e. true for every
        observation whose squared Mahalanobis distance exceeds the cutoff.
    """
    # NOTE: no placeholder result is pre-allocated here; the comparison below
    # produces the returned value directly.
    fast_mcd = FastMCD(
        cols,
        alpha=self._alpha,
        max_n_groups=self._max_n_groups,
        min_group_size=self._min_group_size,
        n_trials=self._n_trials,
        k1=self._k1,
        k2=self._k2,
        k3=self._k3,
    )

    # get location and MCD scatter estimate
    T, S = fast_mcd.get_corrected_T_and_S()

    # get squared Mahalanobis distances
    d2s = maha_dist_sq(cols, T, S)

    # compare to the threshold
    outliers = d2s > self._mahasq_cutoff

    return outliers
def _filter_reflections_based_on_centroid_distance(self):
    """
    Filter reflections too far from predicted position

    A reflection is kept only when both of the following hold:

    * its squared Mahalanobis distance, computed from the X/Y pixel residuals
      with the FastMCD location and scatter estimates, is below the
      chi-squared quantile (2 degrees of freedom) at
      ``self.params.refinement.outlier_probability``;
    * the length of its X/Y residual is below
      ``self.params.refinement.max_separation``.

    ``self.reflections`` is replaced by the filtered selection and a summary
    is written to the log.

    Raises:
        RuntimeError: if fewer than
            ``self.params.refinement.min_n_reflections`` reflections remain.
    """
    refinement = self.params.refinement

    # Compute the x and y residuals
    Xobs, Yobs, _ = self.reflections["xyzobs.px.value"].parts()
    Xcal, Ycal, _ = self.reflections["xyzcal.px"].parts()
    Xres = Xobs - Xcal
    Yres = Yobs - Ycal

    # Compute the epsilon residual: length of the "s2" vectors minus
    # 1 / wavelength of the first experiment's beam
    s0_length = 1.0 / self.experiments[0].beam.get_wavelength()
    s1x, s1y, s1z = self.reflections["s2"].parts()
    s1_length = flex.sqrt(s1x**2 + s1y**2 + s1z**2)
    Eres = s1_length - s0_length

    # Initialise the fast_mcd outlier algorithm.
    # NOTE: only the X and Y residuals enter the MCD fit; the epsilon residual
    # is reported in the log below but is not used for outlier rejection.
    fast_mcd = FastMCD((Xres, Yres))

    # get location and MCD scatter estimate
    T, S = fast_mcd.get_corrected_T_and_S()

    # get squared Mahalanobis distances
    d2s = maha_dist_sq((Xres, Yres), T, S)

    # Compute the cutoff (2 degrees of freedom, matching the two residuals)
    mahasq_cutoff = chisq_quantile(2, refinement.outlier_probability)

    # compare to the threshold and select reflections
    selection1 = d2s < mahasq_cutoff
    selection2 = flex.sqrt(Xres**2 + Yres**2) < refinement.max_separation
    selection = selection1 & selection2
    self.reflections = self.reflections.select(selection)

    # Print some stuff (the residual statistics cover all reflections,
    # including the ones that were just rejected)
    logger.info("-" * 80)
    logger.info("Centroid outlier rejection")
    logger.info(
        " Using MCD algorithm with probability = %f"
        % refinement.outlier_probability
    )
    logger.info(" Max X residual: %f" % flex.max(flex.abs(Xres)))
    logger.info(" Max Y residual: %f" % flex.max(flex.abs(Yres)))
    logger.info(" Max E residual: %f" % flex.max(flex.abs(Eres)))
    logger.info(" Mean X RMSD: %f" % (sqrt(flex.sum(Xres**2) / len(Xres))))
    logger.info(" Mean Y RMSD: %f" % (sqrt(flex.sum(Yres**2) / len(Yres))))
    logger.info(" Mean E RMSD: %f" % (sqrt(flex.sum(Eres**2) / len(Eres))))
    logger.info(" MCD location estimate: %.4f, %.4f" % tuple(T))
    logger.info(""" MCD scatter estimate: %.7f, %.7f, %.7f, %.7f""" % tuple(S))
    logger.info(" Number of outliers: %d" % selection1.count(False))
    logger.info(
        " Number of reflections with residual > %0.2f pixels: %d"
        % (refinement.max_separation, selection2.count(False))
    )
    logger.info(
        " Number of reflections selection for refinement: %d"
        % len(self.reflections)
    )
    logger.info("-" * 80)

    # Throw exception
    if len(self.reflections) < refinement.min_n_reflections:
        raise RuntimeError(
            "Too few reflections to perform refinement: got %d, expected %d"
            % (len(self.reflections), refinement.min_n_reflections)
        )
def _filter_reflections_based_on_centroid_distance(
    reflection_table,
    experiment,
    outlier_probability=0.975,
    max_separation=2,
):
    """
    Discard reflections whose observed centroid is too far from its prediction.

    Two tests are applied to the X/Y pixel residuals and a reflection must
    pass both to be kept:

    * its squared Mahalanobis distance, using the FastMCD location and
      scatter estimates, is below the chi-squared quantile (2 degrees of
      freedom) at ``outlier_probability``;
    * the length of its residual is below ``max_separation``.

    A summary of the residuals and of the rejection counts is logged.

    Args:
        reflection_table: table providing the "xyzobs.px.value", "xyzcal.px"
            and "s2" columns.
        experiment: object whose ``beam.get_wavelength()`` gives the wavelength.
        outlier_probability: probability passed to ``chisq_quantile``.
        max_separation: upper bound on the X/Y residual length.

    Returns:
        The selected subset of ``reflection_table``.
    """

    def _rms(values):
        # Root-mean-square of a residual array
        return sqrt(flex.sum(values**2) / len(values))

    # Pixel residuals between observed and calculated centroids
    obs_x, obs_y, _ = reflection_table["xyzobs.px.value"].parts()
    calc_x, calc_y, _ = reflection_table["xyzcal.px"].parts()
    dx = obs_x - calc_x
    dy = obs_y - calc_y

    # Epsilon residual: length of the "s2" vectors minus 1 / wavelength.
    # It is only reported in the log, not used for the rejection itself.
    inv_wavelength = 1.0 / experiment.beam.get_wavelength()
    sx, sy, sz = reflection_table["s2"].parts()
    d_eps = flex.sqrt(sx**2 + sy**2 + sz**2) - inv_wavelength

    # Robust location / scatter from FastMCD on the X and Y residuals,
    # followed by the squared Mahalanobis distance of every reflection
    location, scatter = FastMCD((dx, dy)).get_corrected_T_and_S()
    dist_sq = maha_dist_sq((dx, dy), location, scatter)

    # Build the two acceptance masks and apply their intersection
    within_cutoff = dist_sq < chisq_quantile(2, outlier_probability)
    within_separation = flex.sqrt(dx**2 + dy**2) < max_separation
    reflection_table = reflection_table.select(within_cutoff & within_separation)
    n_refl = reflection_table.size()

    # Report what was done
    rule = "-" * 80
    logger.info(rule)
    logger.info("Centroid outlier rejection")
    logger.info(f" Using MCD algorithm with probability = {outlier_probability}")
    logger.info(" Max X residual: %f" % flex.max(flex.abs(dx)))
    logger.info(" Max Y residual: %f" % flex.max(flex.abs(dy)))
    logger.info(" Max E residual: %f" % flex.max(flex.abs(d_eps)))
    logger.info(" Mean X RMSD: %f" % (_rms(dx)))
    logger.info(" Mean Y RMSD: %f" % (_rms(dy)))
    logger.info(" Mean E RMSD: %f" % (_rms(d_eps)))
    logger.info(" MCD location estimate: %.4f, %.4f" % tuple(location))
    logger.info(
        """ MCD scatter estimate: %.7f, %.7f, %.7f, %.7f""" % tuple(scatter)
    )
    logger.info(" Number of outliers: %d" % within_cutoff.count(False))
    logger.info(
        " Number of reflections with residual > %0.2f pixels: %d"
        % (max_separation, within_separation.count(False))
    )
    logger.info(f"Number of reflections selection for refinement: {n_refl}")
    logger.info(rule)

    return reflection_table