def tst_outliers_find_posterior_mode():
    """Check the posterior mode reported by the likelihood-ratio outlier test.

    Two cases are run on ten reflections with Fobs = 3, sigma = 0,
    epsilon = 1 and beta = 1:

    * acentric, alpha = 0.99: the mode must agree with Fcalc to within 5%;
    * centric, alpha = 0.099: the ratio mode / Fcalc must equal 0.099
      to within 0.001.
    """
    n_refl = 10

    def posterior_modes(fcalc, is_centric, alpha_value):
        # Only Fcalc, the centric flag and alpha differ between the two cases.
        tester = scaling.likelihood_ratio_outlier_test(
            flex.double([3] * n_refl),              # fobs
            flex.double([0] * n_refl),              # sigmas
            fcalc,
            flex.double([1] * n_refl),              # epsilon
            flex.bool([is_centric] * n_refl),       # centric flags
            flex.double([alpha_value] * n_refl),    # alpha
            flex.double([1] * n_refl))              # beta
        return tester.posterior_mode()

    # Acentric reflections: with alpha close to 1 the mode should be very
    # close to Fcalc.
    acentric_fcalc = flex.double(range(n_refl)) * 10 + 10
    acentric_modes = posterior_modes(acentric_fcalc, False, 0.99)
    for calc, found in zip(acentric_fcalc, acentric_modes):
        assert approx_equal(calc / found, 1, eps=0.05)

    # Centric reflections with a small alpha and much larger Fcalc values.
    centric_fcalc = flex.double(range(n_refl)) * 100 + 100
    centric_modes = posterior_modes(centric_fcalc, True, 0.099)
    for calc, found in zip(centric_fcalc, centric_modes):
        assert approx_equal(found / calc, 0.099, eps=0.001)
def tst_outliers_compare_mode_mean():
    """Compare posterior mode and mean of Fobs over a sweep of alpha values.

    For alpha = ii/50 and beta = 1 - alpha**2 the test checks, for every
    reflection, that -1/(second derivative at the mode) is positive.
    When alpha > 0.9 the mean and the mode must agree to within 0.1:
    for all acentric reflections, and for centric reflections only where
    Fcalc > 1.
    """
    n_refl = 1000
    fobs = flex.double(range(n_refl)) / 300.0
    fcalc = flex.double(range(n_refl)) / 300.0
    epsilon = flex.double(range(n_refl)) * 0 + 1.0
    acentric_flags = flex.bool([False] * n_refl)

    def outlier_statistics(centric_flags, alpha_value):
        # Build the test object for one alpha and return the per-reflection
        # arrays that the assertions below inspect.
        alpha = flex.double([alpha_value] * n_refl)
        beta = flex.double([1.0 - alpha_value * alpha_value] * n_refl)
        tester = scaling.likelihood_ratio_outlier_test(
            fobs, None, fcalc, epsilon, centric_flags, alpha, beta)
        means = tester.mean_fobs()
        tester.std_fobs()  # result is not inspected; the call itself is exercised
        modes = tester.posterior_mode()
        second_derivatives = tester.posterior_mode_snd_der()
        return alpha, means, modes, second_derivatives

    # Acentric reflections, alpha from 0 up to 0.98.
    for step in xrange(50):
        alpha, means, modes, second_derivatives = outlier_statistics(
            acentric_flags, step / 50.0)
        for a, mean, mode, snd_der in zip(alpha, means, modes,
                                          second_derivatives):
            assert (-1.0 / snd_der > 0)
            if a > 0.9:
                assert approx_equal(mean, mode, eps=1e-1)

    # Centric reflections (flags inverted); alpha = 0 is skipped here.
    for step in xrange(1, 50):
        alpha, means, modes, second_derivatives = outlier_statistics(
            ~acentric_flags, step / 50.0)
        for a, fc, mean, mode, snd_der in zip(alpha, fcalc, means, modes,
                                              second_derivatives):
            assert (-1.0 / snd_der > 0)
            if a > 0.9 and fc > 1.0:
                assert approx_equal(mean, mode, eps=1e-1)
def tst_loglikelihoods():
    """Check flag_potential_outliers against the log-likelihood arrays.

    A reflection must be flagged True exactly when the log-likelihood at
    the posterior mode exceeds the log-likelihood of the observation by
    less than the threshold handed to flag_potential_outliers.
    """
    n_refl = 1000
    threshold = 2.0 * 4.5

    tester = scaling.likelihood_ratio_outlier_test(
        flex.double(range(n_refl)) / 200,   # fobs
        flex.double([0] * n_refl),          # sigmas
        flex.double([1] * n_refl),          # fcalc
        flex.double([1] * n_refl),          # epsilon
        flex.bool([False] * n_refl),        # centric flags
        flex.double([0.99] * n_refl),       # alpha
        flex.double([1] * n_refl))          # beta

    observed_log_lik = tester.log_likelihood()
    mode_log_lik = tester.posterior_mode_log_likelihood()
    tester.posterior_mode()  # result is not inspected; the call itself is exercised
    flags = tester.flag_potential_outliers(threshold)

    for flag, at_mode, at_obs in zip(flags, mode_log_lik, observed_log_lik):
        within_threshold = at_mode - at_obs < threshold
        assert (flag if within_threshold else not flag)
def tst_outliers_find_posterior_mode():
    """Check the posterior mode reported by the likelihood-ratio outlier test.

    Two cases are run on ten reflections with Fobs = 3, sigma = 0,
    epsilon = 1 and beta = 1:

    * acentric, alpha = 0.99: the mode must agree with Fcalc to within 5%;
    * centric, alpha = 0.099: the ratio mode / Fcalc must equal 0.099
      to within 0.001.
    """
    n_refl = 10

    def posterior_modes(fcalc, is_centric, alpha_value):
        # Only Fcalc, the centric flag and alpha differ between the two cases.
        tester = scaling.likelihood_ratio_outlier_test(
            flex.double([3] * n_refl),              # fobs
            flex.double([0] * n_refl),              # sigmas
            fcalc,
            flex.double([1] * n_refl),              # epsilon
            flex.bool([is_centric] * n_refl),       # centric flags
            flex.double([alpha_value] * n_refl),    # alpha
            flex.double([1] * n_refl))              # beta
        return tester.posterior_mode()

    # Acentric reflections: with alpha close to 1 the mode should be very
    # close to Fcalc.
    acentric_fcalc = flex.double(range(n_refl)) * 10 + 10
    acentric_modes = posterior_modes(acentric_fcalc, False, 0.99)
    for calc, found in zip(acentric_fcalc, acentric_modes):
        assert approx_equal(calc / found, 1, eps=0.05)

    # Centric reflections with a small alpha and much larger Fcalc values.
    centric_fcalc = flex.double(range(n_refl)) * 100 + 100
    centric_modes = posterior_modes(centric_fcalc, True, 0.099)
    for calc, found in zip(centric_fcalc, centric_modes):
        assert approx_equal(found / calc, 0.099, eps=0.001)
def tst_outliers_compare_mode_mean():
    """Compare posterior mode and mean of Fobs over a sweep of alpha values.

    For alpha = ii/50 and beta = 1 - alpha**2 the test checks, for every
    reflection, that -1/(second derivative at the mode) is positive.
    When alpha > 0.9 the mean and the mode must agree to within 0.1:
    for all acentric reflections, and for centric reflections only where
    Fcalc > 1.
    """
    n_refl = 1000
    fobs = flex.double(range(n_refl)) / 300.0
    fcalc = flex.double(range(n_refl)) / 300.0
    epsilon = flex.double(range(n_refl)) * 0 + 1.0
    acentric_flags = flex.bool([False] * n_refl)

    def outlier_statistics(centric_flags, alpha_value):
        # Build the test object for one alpha and return the per-reflection
        # arrays that the assertions below inspect.
        alpha = flex.double([alpha_value] * n_refl)
        beta = flex.double([1.0 - alpha_value * alpha_value] * n_refl)
        tester = scaling.likelihood_ratio_outlier_test(
            fobs, None, fcalc, epsilon, centric_flags, alpha, beta)
        means = tester.mean_fobs()
        tester.std_fobs()  # result is not inspected; the call itself is exercised
        modes = tester.posterior_mode()
        second_derivatives = tester.posterior_mode_snd_der()
        return alpha, means, modes, second_derivatives

    # Acentric reflections, alpha from 0 up to 0.98.
    for step in xrange(50):
        alpha, means, modes, second_derivatives = outlier_statistics(
            acentric_flags, step / 50.0)
        for a, mean, mode, snd_der in zip(alpha, means, modes,
                                          second_derivatives):
            assert (-1.0 / snd_der > 0)
            if a > 0.9:
                assert approx_equal(mean, mode, eps=1e-1)

    # Centric reflections (flags inverted); alpha = 0 is skipped here.
    for step in xrange(1, 50):
        alpha, means, modes, second_derivatives = outlier_statistics(
            ~acentric_flags, step / 50.0)
        for a, fc, mean, mode, snd_der in zip(alpha, fcalc, means, modes,
                                              second_derivatives):
            assert (-1.0 / snd_der > 0)
            if a > 0.9 and fc > 1.0:
                assert approx_equal(mean, mode, eps=1e-1)
def tst_loglikelihoods():
    """Check flag_potential_outliers against the log-likelihood arrays.

    A reflection must be flagged True exactly when the log-likelihood at
    the posterior mode exceeds the log-likelihood of the observation by
    less than the threshold handed to flag_potential_outliers.
    """
    n_refl = 1000
    threshold = 2.0 * 4.5

    tester = scaling.likelihood_ratio_outlier_test(
        flex.double(range(n_refl)) / 200,   # fobs
        flex.double([0] * n_refl),          # sigmas
        flex.double([1] * n_refl),          # fcalc
        flex.double([1] * n_refl),          # epsilon
        flex.bool([False] * n_refl),        # centric flags
        flex.double([0.99] * n_refl),       # alpha
        flex.double([1] * n_refl))          # beta

    observed_log_lik = tester.log_likelihood()
    mode_log_lik = tester.posterior_mode_log_likelihood()
    tester.posterior_mode()  # result is not inspected; the call itself is exercised
    flags = tester.flag_potential_outliers(threshold)

    for flag, at_mode, at_obs in zip(flags, mode_log_lik, observed_log_lik):
        within_threshold = at_mode - at_obs < threshold
        assert (flag if within_threshold else not flag)
def plotit(fobs, sigma, fcalc, alpha, beta, epsilon, centric, out, limit=5.0, steps=1000, plot_title="Outlier plot"):
    """Write a likelihood report and a loggraph plot for a single reflection.

    A one-reflection likelihood_ratio_outlier_test is used to print the
    posterior mode, a Gaussian width derived from the second derivative at
    the mode, the log-likelihoods of the observation and of the mode, and
    the mean of Fobs. The likelihood is then sampled on a grid of Fobs
    values and plotted together with a Gaussian centred on the observation.

    Parameters:
      fobs, fcalc, alpha, beta, epsilon : scalar values for the reflection.
      sigma : width of the Gaussian around fobs; when None or <= 0,
              fobs/30.0 is used instead.
      centric : centric flag of the reflection.
      out : stream that receives the report and the plot.
      limit : half-width of the sampled Fobs range, in units of the
              Gaussian width around the mode.
      steps : number of grid points.
      plot_title : title printed in the report and used for the plot.
    """

    def build_test(fobs_values, n_points):
        # Everything except Fobs is constant over the n_points entries; no
        # sigma array is passed to the test.
        return scaling.likelihood_ratio_outlier_test(
            fobs_values,
            None,
            flex.double([fcalc] * n_points),
            flex.double([epsilon] * n_points),
            flex.bool([centric] * n_points),
            flex.double([alpha] * n_points),
            flex.double([beta] * n_points))

    point_test = build_test(flex.double([fobs]), 1)

    print >> out
    print >> out, "#Input parameters: "
    for label, value in (
            ("#Title : ", plot_title),
            ("#F-calc : ", fcalc),
            ("#F-obs : ", fobs),
            ("#epsilon : ", epsilon),
            ("#alpha : ", alpha),
            ("#beta : ", beta),
            ("#centric : ", centric)):
        print >> out, label, value

    mode = point_test.posterior_mode()[0]
    second_derivative = point_test.posterior_mode_snd_der()[0]
    # Width of the Gaussian approximation around the mode.
    stdev = math.sqrt(1.0 / math.fabs(second_derivative))

    for text in (
            "#A Gaussian approximation of the likelihood function",
            "#could be constructed as follows with: ",
            "# exp[-(fobs-mode)**2/(2*stdev**2)] /(sqrt(2 pi) stdev)",
            "#with"):
        print >> out, text
    print >> out, "#mode = ", mode
    print >> out, "#stdev = ", stdev
    print >> out
    print >> out, "#The log likelihood values for the mode and "
    print >> out, "#observed values are"
    print >> out, "#Log[P(fobs)] : ", point_test.log_likelihood()[0]
    print >> out, "#Log[P(mode)] : ", point_test.posterior_mode_log_likelihood()[0]
    print >> out, "#Their difference is:"
    delta = (point_test.log_likelihood()[0]
             - point_test.posterior_mode_log_likelihood()[0])
    print >> out, "#delta : ", delta
    print >> out, "#"
    first_moment = point_test.mean_fobs()[0]
    print >> out, "#mean f_obs : ", first_moment, " (first moment)"

    # Sample `limit` widths either side of the mode, never below zero, and
    # stretch the window when the observation itself falls outside it.
    low_limit = max(mode - stdev * limit, 0)
    high_limit = mode + limit * stdev
    if fobs < low_limit:
        low_limit = max(fobs - 2.0 * stdev, 0)
    if fobs > high_limit:
        high_limit = fobs + 2.0 * stdev

    span = high_limit - low_limit
    grid = flex.double(range(steps)) * span / float(steps) + low_limit
    likelihood = flex.exp(build_test(grid, steps).log_likelihood())

    # NOTE(review): for fobs <= 0 this fallback is not positive either, so
    # the Gaussian below is degenerate - confirm callers never pass that.
    if (sigma is None) or (sigma <= 0):
        sigma = fobs / 30.0
    deviation = (grid - fobs) / float(sigma)
    normalisation = math.sqrt(2.0 * math.pi * sigma * sigma)
    obs_gauss = flex.exp(-deviation * deviation / 2.0) / normalisation

    # Cap the Gaussian at 110% of the likelihood maximum so that a narrow
    # peak does not dominate the vertical scale of the plot.
    ceiling = flex.max(likelihood) * 1.10
    above_ceiling = flex.bool(obs_gauss >= ceiling)
    obs_gauss = obs_gauss.set_selected(above_ceiling, ceiling)

    plot = data_plots.plot_data(
        plot_title=plot_title,
        x_label='Fobs',
        y_label='P(Fobs)',
        x_data=grid,
        y_data=likelihood,
        y_legend='P(Fobs|Fcalc,alpha,beta)',
        comments='Fobs=%5.2f, sigma=%5.2f, Fcalc=%5.2f' % (fobs, sigma, fcalc))
    plot.add_data(y_data=obs_gauss, y_legend="P(Fobs|<Fobs>,sigma)")
    data_plots.plot_data_loggraph(plot, out)
def model_based_outliers(self, f_model, level=.01, return_data=False, plot_out=None):
    """Flag reflections using a likelihood-ratio test against a model.

    Sigma-A is estimated from self.miller_obs, f_model and
    self.r_free_flags; the normalized amplitudes are fed to
    scaling.likelihood_ratio_outlier_test. Its standardized likelihood
    values x are converted to p-values 1 - erf(sqrt(x/2))**N, with N the
    number of reflections, and a reflection is flagged True when its
    p-value exceeds `level`. A report is printed to self.out.

    Parameters:
      f_model : calculated amplitudes, passed on as miller_calc.
      level : p-value threshold.
      return_data : when true, return self.miller_obs restricted to the
                    flagged reflections instead of the flag array.
      plot_out : forwarded unchanged to self.make_log_model.

    Side effect: when self.r_free_flags contains no True entry it is
    replaced by its inverse before the sigma-A estimation.
    """
    assert self.r_free_flags is not None
    free_flag_data = self.r_free_flags.data()
    if free_flag_data.count(True) == 0:
        self.r_free_flags = self.r_free_flags.array(
            data=~self.r_free_flags.data())

    estimator = sigmaa_estimation.sigmaa_estimator(
        miller_obs=self.miller_obs,
        miller_calc=f_model,
        r_free_flags=self.r_free_flags,
        kernel_width_free_reflections=200,
        n_sampling_points=20,
        n_chebyshev_terms=13)
    estimator.show(out=self.out)
    sigmaa = estimator.sigmaa()
    obs_norm = abs(estimator.normalized_obs)
    calc_norm = estimator.normalized_calc

    # The data are pre-normalized, so every epsilon is unity.
    unit_epsilon = flex.double(calc_norm.data().size(), 1.0)
    outlier_test = scaling.likelihood_ratio_outlier_test(
        f_obs=obs_norm.data(),
        sigma_obs=None,
        f_calc=calc_norm.data(),
        epsilon=unit_epsilon,
        centric=obs_norm.centric_flags().data(),
        alpha=sigmaa.data(),
        beta=1.0 - sigmaa.data() * sigmaa.data())

    # These four results are not used below; the calls are retained so the
    # sequence of calls on the test object is unchanged.
    outlier_test.posterior_mode()
    outlier_test.log_likelihood()
    outlier_test.posterior_mode_log_likelihood()
    outlier_test.posterior_mode_snd_der()
    ll_gain = outlier_test.standardized_likelihood()

    # The smallest value should be 0, but numerical noise can push it
    # slightly negative; lift everything below the floor up to the floor.
    # NOTE(review): if set_selected works in place, ll_gain is clamped as
    # well - confirm against the flex API.
    floor = 1.0e-10
    below_floor = flex.bool(ll_gain < floor)
    clamped_gain = ll_gain.set_selected(below_floor, floor)
    single_p = erf(flex.sqrt(clamped_gain / 2.0))
    p_values = 1.0 - flex.pow(single_p, float(single_p.size()))

    # Select on p-values and attach all arrays to the observed indices.
    flags = self.miller_obs.customized_copy(data=flex.bool(p_values > level))
    ll_gain = self.miller_obs.customized_copy(data=ll_gain)
    p_values = self.miller_obs.customized_copy(data=p_values)

    # Report text; content and line break are kept exactly as in the file.
    log_message = (
        " Model based outlier rejection. \n"
        "------------------------------ "
        "Calculated amplitudes and estimated values of alpha and beta are "
        "used to compute the log-likelihood of the observed amplitude. "
        "The method is inspired by Read, Acta Cryst. (1999). D55, 1759-1764. "
        "Outliers are rejected on the basis of the assumption that a scaled "
        "log likelihood differnce 2(log[P(Fobs)]-log[P(Fmode)])/Q\" is "
        "distributed according to a Chi-square distribution (Q\" is equal to "
        "the second derivative of the log likelihood function of the mode of "
        "the distribution). The outlier threshold of the p-value relates to "
        "the p-value of the extreme value distribution of the chi-square "
        "distribution. "
    )

    # NOTE(review): the return values of map_to_asu() are discarded -
    # confirm whether it acts in place.
    for array in (flags, ll_gain, p_values):
        array.map_to_asu()
    for array in (flags, ll_gain, p_values):
        assert array.indices().all_eq(self.miller_obs.indices())

    log_message = self.make_log_model(
        log_message, flags, ll_gain, p_values,
        obs_norm, calc_norm, sigmaa, plot_out)

    report = StringIO()
    print >> report, log_message
    # Histogram of the scaled log-likelihood gain values.
    print >> report
    print >> report, "The histoghram of scaled (LL-gain) values is shown below."
    print >> report, " Note: scaled (LL-gain) is approximately Chi-square distributed."
    print >> report
    print >> report, " scaled(LL-gain) Frequency"
    flex.histogram(ll_gain.data(), 15).show(f=report, format_cutoffs='%7.3f')
    print >> self.out, report.getvalue()

    if return_data:
        assert flags.indices().all_eq(self.miller_obs.indices())
        return self.miller_obs.select(flags.data())
    return flags
def model_based_outliers(self, f_model, level=0.01, return_data=False, plot_out=None):
    """Flag reflections using a likelihood-ratio test against a model.

    Sigma-A is estimated from self.miller_obs, f_model and
    self.r_free_flags; the normalized amplitudes are fed to
    scaling.likelihood_ratio_outlier_test. Its standardized likelihood
    values x are converted to p-values 1 - erf(sqrt(x/2))**N, with N the
    number of reflections, and a reflection is flagged True when its
    p-value exceeds `level`. A report is printed to self.out.

    Parameters:
      f_model : calculated amplitudes, passed on as miller_calc.
      level : p-value threshold.
      return_data : when true, return self.miller_obs restricted to the
                    flagged reflections instead of the flag array.
      plot_out : forwarded unchanged to self.make_log_model.

    Side effect: when self.r_free_flags contains no True entry it is
    replaced by its inverse before the sigma-A estimation.
    """
    assert self.r_free_flags is not None
    free_flag_data = self.r_free_flags.data()
    if free_flag_data.count(True) == 0:
        self.r_free_flags = self.r_free_flags.array(
            data=~self.r_free_flags.data())

    estimator = sigmaa_estimation.sigmaa_estimator(
        miller_obs=self.miller_obs,
        miller_calc=f_model,
        r_free_flags=self.r_free_flags,
        kernel_width_free_reflections=200,
        n_sampling_points=20,
        n_chebyshev_terms=13)
    estimator.show(out=self.out)
    sigmaa = estimator.sigmaa()
    obs_norm = abs(estimator.normalized_obs)
    calc_norm = estimator.normalized_calc

    # The data are pre-normalized, so every epsilon is unity.
    unit_epsilon = flex.double(calc_norm.data().size(), 1.0)
    outlier_test = scaling.likelihood_ratio_outlier_test(
        f_obs=obs_norm.data(),
        sigma_obs=None,
        f_calc=calc_norm.data(),
        epsilon=unit_epsilon,
        centric=obs_norm.centric_flags().data(),
        alpha=sigmaa.data(),
        beta=1.0 - sigmaa.data() * sigmaa.data())

    # These four results are not used below; the calls are retained so the
    # sequence of calls on the test object is unchanged.
    outlier_test.posterior_mode()
    outlier_test.log_likelihood()
    outlier_test.posterior_mode_log_likelihood()
    outlier_test.posterior_mode_snd_der()
    ll_gain = outlier_test.standardized_likelihood()

    # The smallest value should be 0, but numerical noise can push it
    # slightly negative; lift everything below the floor up to the floor.
    # NOTE(review): if set_selected works in place, ll_gain is clamped as
    # well - confirm against the flex API.
    floor = 1.0e-10
    below_floor = flex.bool(ll_gain < floor)
    clamped_gain = ll_gain.set_selected(below_floor, floor)
    single_p = erf(flex.sqrt(clamped_gain / 2.0))
    p_values = 1.0 - flex.pow(single_p, float(single_p.size()))

    # Select on p-values and attach all arrays to the observed indices.
    flags = self.miller_obs.customized_copy(data=flex.bool(p_values > level))
    ll_gain = self.miller_obs.customized_copy(data=ll_gain)
    p_values = self.miller_obs.customized_copy(data=p_values)

    # Report text; content and line break are kept exactly as in the file.
    log_message = (
        " Model based outlier rejection. \n"
        "------------------------------ "
        "Calculated amplitudes and estimated values of alpha and beta are "
        "used to compute the log-likelihood of the observed amplitude. "
        "The method is inspired by Read, Acta Cryst. (1999). D55, 1759-1764. "
        "Outliers are rejected on the basis of the assumption that a scaled "
        "log likelihood differnce 2(log[P(Fobs)]-log[P(Fmode)])/Q\" is "
        "distributed according to a Chi-square distribution (Q\" is equal to "
        "the second derivative of the log likelihood function of the mode of "
        "the distribution). The outlier threshold of the p-value relates to "
        "the p-value of the extreme value distribution of the chi-square "
        "distribution. "
    )

    # NOTE(review): the return values of map_to_asu() are discarded -
    # confirm whether it acts in place.
    for array in (flags, ll_gain, p_values):
        array.map_to_asu()
    for array in (flags, ll_gain, p_values):
        assert array.indices().all_eq(self.miller_obs.indices())

    log_message = self.make_log_model(
        log_message, flags, ll_gain, p_values,
        obs_norm, calc_norm, sigmaa, plot_out)

    report = StringIO()
    print >> report, log_message
    # Histogram of the scaled log-likelihood gain values.
    print >> report
    print >> report, "The histoghram of scaled (LL-gain) values is shown below."
    print >> report, " Note: scaled (LL-gain) is approximately Chi-square distributed."
    print >> report
    print >> report, " scaled(LL-gain) Frequency"
    flex.histogram(ll_gain.data(), 15).show(f=report, format_cutoffs="%7.3f")
    print >> self.out, report.getvalue()

    if return_data:
        assert flags.indices().all_eq(self.miller_obs.indices())
        return self.miller_obs.select(flags.data())
    return flags