def anomalous_probability_plot(intensities, expected_delta=None):
    from scitbx.math import distributions
    from scitbx.array_family import flex

    assert intensities.is_unique_set_under_symmetry()
    assert intensities.anomalous_flag()

    dI = intensities.anomalous_differences()
    y = dI.data() / dI.sigmas()
    perm = flex.sort_permutation(y)
    y = y.select(perm)
    distribution = distributions.normal_distribution()
    x = distribution.quantiles(y.size())

    if expected_delta is not None:
        sel = flex.abs(x) < expected_delta
        x = x.select(sel)
        y = y.select(sel)

    fit = flex.linear_regression(x, y)
    correlation = flex.linear_correlation(x, y)
    assert fit.is_well_defined()

    if 0:  # debug plotting, disabled by default
        from matplotlib import pyplot
        pyplot.scatter(x, y)
        m = fit.slope()
        c = fit.y_intercept()
        pyplot.plot(pyplot.xlim(), [m * x_ + c for x_ in pyplot.xlim()])
        pyplot.show()

    return fit.slope(), fit.y_intercept(), x.size()
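# A minimal sanity check for the function above (illustrative only, not part
# of the original module): for deviates drawn from N(0, 1), the fitted normal
# probability plot should have slope ~1 and intercept ~0. This sketch assumes
# a working scitbx installation and uses the same primitives as the function.
def _npp_fit_demo(n=1000, seed=0):
    import random
    from scitbx.math import distributions
    from scitbx.array_family import flex
    random.seed(seed)
    # sorted synthetic z-scores stand in for dI/sigma(dI)
    y = flex.double(sorted(random.gauss(0, 1) for _ in range(n)))
    x = distributions.normal_distribution().quantiles(n)
    fit = flex.linear_regression(x, y)
    return fit.slope(), fit.y_intercept()  # both should be close to (1, 0)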
def run(self):
    """Calculate statistics of an array of observed values with uncertainties"""
    query_values, ref_values, q_cut, file_manager = self.data
    # Extract the theoretical quantiles that we would expect if these values
    # were drawn from a normal distribution
    the_diff_vals = normal_distribution().quantiles(len(query_values))
    # Select the points in the middle of the distribution
    mid_idxs = (the_diff_vals < q_cut).iselection().intersection(
        (the_diff_vals > -1 * q_cut).iselection())
    mid_the_diff_vals = the_diff_vals.select(mid_idxs)
    # Calculate the differences from the reference values
    act_diff_vals = query_values - ref_values
    srt_act_diff_vals = flex.double(sorted(act_diff_vals))
    mid_act_diff_vals = srt_act_diff_vals.select(mid_idxs)
    # Calculate the slope of the centre of the graph
    map_unc, map_off = numpy.polyfit(x=mid_the_diff_vals, y=mid_act_diff_vals, deg=1)
    try:
        import matplotlib
        matplotlib.interactive(False)
        from matplotlib import pyplot
        pyplot.style.use('ggplot')
        output_graphs = True
    except ImportError:
        output_graphs = False
    if output_graphs and file_manager:
        # Sort query and ref values for plotting
        srt_query_vals = sorted(query_values)
        srt_ref_vals = sorted(ref_values)
        analyse_graphs.mean_obs_scatter(
            f_name=file_manager.get_file('obs_qqplot_unsorted_png'),
            mean_vals=ref_values,
            obs_vals=query_values)
        analyse_graphs.sorted_mean_obs_scatter(
            f_name=file_manager.get_file('obs_qqplot_sorted_png'),
            mean_vals=srt_ref_vals,
            obs_vals=srt_query_vals)
        analyse_graphs.uncertainty_qqplot(
            f_name=file_manager.get_file('unc_qqplot_png'),
            map_off=map_off,
            map_unc=map_unc,
            q_cut=q_cut,
            obs_diff=srt_act_diff_vals,
            quantile=the_diff_vals)
    # Print a running row of dots as a progress indicator
    print('>', end='')
    sys.stdout.flush()
    return map_unc
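# Why the fitted slope above serves as an uncertainty estimate (a sketch under
# the assumption that numpy and scipy are available; the method itself only
# needs numpy): for differences d ~ N(mu, sigma^2), the sorted values plotted
# against standard normal quantiles lie near a line with slope sigma and
# intercept mu, so polyfit recovers (sigma, mu) just as (map_unc, map_off)
# are recovered above.
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
d = rng.normal(0.3, 2.0, size=2000)
q = norm.ppf((np.arange(d.size) + 0.5) / d.size)   # theoretical quantiles
sigma_est, mu_est = np.polyfit(q, np.sort(d), deg=1)  # ~2.0 and ~0.3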
def __init__(self, hooft_analysis, use_students_t_distribution=False,
             students_t_nu=None, probability_plot_slope=None):
    self.delta_fo2, minus_fo2 = \
        hooft_analysis.delta_fo2.generate_bijvoet_mates().hemispheres_acentrics()
    self.delta_fc2, minus_fc2 = \
        hooft_analysis.delta_fc2.generate_bijvoet_mates().hemispheres_acentrics()
    # we want to plot both hemispheres
    self.delta_fo2.indices().extend(minus_fo2.indices())
    self.delta_fo2.data().extend(minus_fo2.data() * -1)
    self.delta_fo2.sigmas().extend(minus_fo2.sigmas())
    self.delta_fc2.indices().extend(minus_fc2.indices())
    self.delta_fc2.data().extend(minus_fc2.data() * -1)
    self.indices = self.delta_fo2.indices()
    observed_deviations = (
        hooft_analysis.G * self.delta_fc2.data() - self.delta_fo2.data()
    ) / self.delta_fo2.sigmas()
    if probability_plot_slope is not None:
        observed_deviations /= probability_plot_slope
    selection = flex.sort_permutation(observed_deviations)
    observed_deviations = observed_deviations.select(selection)
    if use_students_t_distribution:
        if students_t_nu is None:
            students_t_nu = maximise_students_t_correlation_coefficient(
                observed_deviations, 1, 200)
        self.distribution = distributions.students_t_distribution(students_t_nu)
    else:
        self.distribution = distributions.normal_distribution()
    self.x = self.distribution.quantiles(observed_deviations.size())
    self.y = observed_deviations
    # trim the five most extreme points at each end before fitting
    self.fit = flex.linear_regression(self.x[5:-5], self.y[5:-5])
    self.correlation = flex.linear_correlation(self.x[5:-5], self.y[5:-5])
    assert self.fit.is_well_defined()
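# The class above switches between normal and Student's t quantiles; data with
# heavy-tailed errors fit a t probability plot better than a normal one. A
# small comparison using the same scitbx API (illustrative; nu=5 and n=9 are
# arbitrary choices for the sketch):
from scitbx.math import distributions

nq = distributions.normal_distribution().quantiles(9)
tq = distributions.students_t_distribution(5).quantiles(9)
# The t quantiles are wider in the tails: tq[0] < nq[0] and tq[-1] > nq[-1],
# so a heavy-tailed sample correlates better with tq than with nq.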
def normal_probability_plot(self, data, rankits_sel=None, plot=False):
    """Use normal probability analysis to determine if a set of data is
    normally distributed. See https://en.wikipedia.org/wiki/Normal_probability_plot.
    Rankits are computed in the same way as qqnorm does in R.
    @param data flex array
    @param rankits_sel only use the rankits in a certain range. Useful for
           outlier rejection. Should be a tuple such as (-0.5, 0.5).
    @param plot whether to show the normal probability plot
    """
    from scitbx.math import distributions
    import numpy as np
    norm = distributions.normal_distribution()

    n = len(data)
    # Blom's approximation for the rankit offset
    if n <= 10:
        a = 3 / 8
    else:
        a = 0.5

    sorted_data = flex.sorted(data)
    rankits = flex.double(
        [norm.quantile((i + 1 - a) / (n + 1 - (2 * a))) for i in range(n)])

    if rankits_sel is None:
        corr, slope, offset = self.get_overall_correlation_flex(sorted_data, rankits)
    else:
        sel = (rankits >= rankits_sel[0]) & (rankits <= rankits_sel[1])
        corr, slope, offset = self.get_overall_correlation_flex(
            sorted_data.select(sel), rankits.select(sel))

    if plot:
        from matplotlib import pyplot as plt
        f = plt.figure(0)
        lim = -5, 5
        x = np.linspace(lim[0], lim[1], 100)  # 100 linearly spaced numbers
        y = slope * x + offset
        plt.plot(sorted_data, rankits, '-')
        # plt.plot(x, y)
        plt.title("CC: %.3f Slope: %.3f Offset: %.3f" % (corr, slope, offset))
        plt.xlabel("Sorted data")
        plt.ylabel("Rankits")
        plt.xlim(lim)
        plt.ylim(lim)
        plt.gca().set_aspect('equal')

        f = plt.figure(1)
        h = flex.histogram(sorted_data, n_slots=100, data_min=lim[0], data_max=lim[1])
        stats = flex.mean_and_variance(sorted_data)
        plt.plot(h.slot_centers().as_numpy_array(), h.slots().as_numpy_array(), '-')
        plt.xlim(lim)
        plt.xlabel("Sorted data")
        plt.ylabel("Count")
        plt.title("Normalized data mean: %.3f +/- %.3f" % (
            stats.mean(), stats.unweighted_sample_standard_deviation()))

        if self.scaler.params.raw_data.error_models.sdfac_refine.plot_refinement_steps:
            plt.ion()
            plt.pause(0.05)

    return corr, slope, offset
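# Cross-check of the rankit formula above (assumes scipy, which the method
# itself does not use): (i + 1 - a) / (n + 1 - 2a) with Blom's offset a
# reproduces the plotting positions of R's qqnorm, evaluated here with
# norm.ppf instead of scitbx's norm.quantile.
import numpy as np
from scipy.stats import norm

def blom_rankits(n):
    a = 3.0 / 8.0 if n <= 10 else 0.5
    i = np.arange(n)  # 0-based, matching the range(n) loops above
    return norm.ppf((i + 1 - a) / (n + 1 - 2 * a))

print(blom_rankits(5))  # ~[-1.179, -0.497, 0.0, 0.497, 1.179], as from qqnorm(1:5)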
def npp(values, input_mean_variance):
    import math
    from scitbx.math import distributions
    from scitbx.array_family import flex
    distribution = distributions.normal_distribution()
    values = flex.sorted(values)
    mean, variance = input_mean_variance
    scaled = (values - mean) / math.sqrt(variance)
    expected = distribution.quantiles(values.size())
    return expected, scaled
def npp_ify(values, input_mean_variance=None):
    '''Analyse data in values (assumed to be drawn from one population) and
    return the sorted list of (expected, observed) deviations from the mean.'''
    import math
    from scitbx.math import distributions
    from scitbx.array_family import flex
    distribution = distributions.normal_distribution()
    values = flex.sorted(values)
    if input_mean_variance:
        mean, variance = input_mean_variance
    else:
        # mean_variance is a helper defined elsewhere in this module
        mean, variance = mean_variance(values)
    scaled = (values - mean) / math.sqrt(variance)
    expected = distribution.quantiles(values.size())
    return expected, scaled
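# Example use of npp_ify (hypothetical data; assumes scitbx is installed):
# values drawn from N(2, 4) should track the identity line once centred and
# scaled by the supplied mean and variance.
import random
from scitbx.array_family import flex

random.seed(1)
vals = flex.double([random.gauss(2, 2) for _ in range(500)])
expected, scaled = npp_ify(vals, input_mean_variance=(2, 4))
# expected[i] and scaled[i] should agree closely away from the tails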
def map_value_distribution(f_name, plot_vals, plot_normal=False):
    """Plot histogram of values, with optional normal distribution overlays"""
    from scitbx.math.distributions import normal_distribution
    fig = pyplot.figure()
    pyplot.title('Distribution of map values')
    # 'normed' was removed from matplotlib; 'density' is the equivalent argument
    pyplot.hist(x=plot_vals, bins=30, density=True)
    if plot_normal:
        # Plot the density for N(0,1)
        nd_t = normal_distribution()
        theor_x = numpy.linspace(-5, 5, 101)
        theor_y = [nd_t.pdf(x) for x in theor_x]
        pyplot.plot(theor_x, theor_y, c='k', ls='--', marker='o')
        # Plot the density for the observed distribution
        nd_o = normal_distribution(mean=numpy.mean(plot_vals), sd=numpy.std(plot_vals))
        obs_x = numpy.linspace(-5, 5, 101)
        obs_y = [nd_o.pdf(x) for x in obs_x]
        pyplot.plot(obs_x, obs_y, c='g', ls='-', marker='o')
    pyplot.xlabel('Map value')
    pyplot.ylabel('Density')
    # pyplot.tight_layout()
    pyplot.subplots_adjust()
    pyplot.savefig(f_name)
    pyplot.close(fig)
def convert_pvalue_to_zscore(pval, two_tailed=True):
    """Convert a p-value to a z-score for a standard normal N(0,1)"""
    # If two-tailed test, need to halve the p-value
    if two_tailed:
        pval = pval / 2.0
    # Create normal distribution to convert - N(0,1)
    nrm = distributions.normal_distribution()
    # Calculate the probability quantile (z-score) corresponding to 1-pval
    try:
        zsco = nrm.quantile(1.0 - pval)
    except RuntimeError:
        # pval too small for 1-pval to differ from 1.0 in double precision:
        # return 8.2, the largest calculable z-score (one-tailed p ~ 6e-17)
        if pval < 6e-17:
            zsco = 8.2
        else:
            raise
    return zsco
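# Sanity checks against standard normal tables (illustrative only): a
# two-tailed p-value of 0.05 corresponds to z ~ 1.96, and 0.0027 to z ~ 3.0.
assert abs(convert_pvalue_to_zscore(0.05) - 1.959964) < 1e-4
assert abs(convert_pvalue_to_zscore(0.0027) - 3.0) < 1e-3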
def calculate_sorted_deviations(self, parameters):
    """Sort the x,y data."""
    sigmaprime = calc_sigmaprime(parameters, self.filtered_Ih_table)
    delta_hl = calc_deltahl(
        self.filtered_Ih_table, self.filtered_Ih_table.calc_nh(), sigmaprime)
    norm = normal_distribution()
    n = len(delta_hl)
    # Blom's approximation for the rankit offset
    if n <= 10:
        a = 3 / 8
    else:
        a = 0.5
    self.sortedy = flex.sorted(flex.double(delta_hl))
    self.sortedx = flex.double(
        [norm.quantile((i + 1 - a) / (n + 1 - (2 * a))) for i in range(n)])
    # Restrict to the central part of the distribution
    central_sel = (self.sortedx < 1.5) & (self.sortedx > -1.5)
    self.sortedx = self.sortedx.select(central_sel)
    self.sortedy = self.sortedy.select(central_sel)
def qq_plot_against_normal(f_name, plot_vals):
    """Sort and plot list of values against expected quantiles from a normal distribution"""
    from scitbx.math.distributions import normal_distribution
    fig = pyplot.figure()
    pyplot.title('Q-Q plot for map values against normal distribution')
    expected_vals = normal_distribution().quantiles(len(plot_vals))
    # Plot the identity line as a reference, then the observed quantiles
    pyplot.plot([min(expected_vals) - 1, max(expected_vals) + 1],
                [min(expected_vals) - 1, max(expected_vals) + 1], 'b--')
    pyplot.plot(sorted(plot_vals), expected_vals, 'go-')
    pyplot.xlabel('Observed quantiles')
    pyplot.ylabel('Theoretical quantiles')
    # pyplot.tight_layout()
    pyplot.subplots_adjust()
    pyplot.savefig(f_name)
    pyplot.close(fig)
def calculate_delta_rankits(self):
    '''Implement expression (12) of Brewster2019'''
    # Get the base global index for this rank's deltas. Example: if rank 0
    # has 10 deltas, the first delta on rank 1 will be the 10th global delta.
    # allreduce of single-element lists concatenates them in rank order.
    delta_count_per_rank = self.mpi_helper.comm.allreduce([self.deltas.size()])
    base_delta_index = sum(delta_count_per_rank[0:self.mpi_helper.rank])
    self.logger.log("Delta base index: %d" % base_delta_index)

    from scitbx.math import distributions
    import numpy as np
    norm = distributions.normal_distribution()
    # Blom's approximation for the rankit offset
    a = 3. / 8. if self.global_delta_count < 10. else 0.5

    self.rankits = flex.double()
    for i in range(self.deltas.size()):
        global_delta_index = base_delta_index + i
        rankit = norm.quantile(
            (global_delta_index + 1 - a) / (self.global_delta_count + 1 - (2 * a)))
        self.rankits.append(rankit)
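# The index bookkeeping above, sketched without MPI (hypothetical per-rank
# counts): mpi4py's comm.allreduce with the default SUM op applied to Python
# lists concatenates them, so each rank can locate its contiguous slice of
# the globally sorted deltas.
delta_count_per_rank = [10, 7, 12]                     # ranks 0, 1, 2
rank = 1
base_delta_index = sum(delta_count_per_rank[:rank])    # -> 10
global_delta_count = sum(delta_count_per_rank)         # -> 29
# rank 1 then computes rankits for global indices 10..16 inclusive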
def del_anom_normal_plot(intensities, strong_cutoff=0.0):
    """Make a normal probability plot of the normalised anomalous differences."""
    diff_array = intensities.anomalous_differences()
    if not diff_array.data().size():
        return {}
    delta = diff_array.data() / diff_array.sigmas()

    norm = distributions.normal_distribution()
    n = len(delta)
    # Blom's approximation for the rankit offset
    if n <= 10:
        a = 3 / 8
    else:
        a = 0.5

    y = flex.sorted(delta)
    x = [norm.quantile((i + 1 - a) / (n + 1 - (2 * a))) for i in range(n)]

    H, xedges, yedges = np.histogram2d(np.array(x), y.as_numpy_array(), bins=(200, 200))
    nonzeros = np.nonzero(H)
    z = np.empty(H.shape)
    z[:] = np.nan
    z[nonzeros] = H[nonzeros]

    # also make a histogram
    histy = flex.histogram(y, n_slots=100)
    # make a gaussian for reference also (not currently included in the
    # returned plots)
    n = y.size()
    width = histy.slot_centers()[1] - histy.slot_centers()[0]
    gaussian = []
    from math import exp, pi
    for sc in histy.slot_centers():
        gaussian.append(n * width * exp(-(sc**2) / 2.0) / ((2.0 * pi)**0.5))

    title = "Normal probability plot of anomalous differences"
    plotname = "normal_distribution_plot"
    if strong_cutoff > 0.0:
        title += " (d > %.2f)" % strong_cutoff
        plotname += "_lowres"
    else:
        title += " (all data)"
        plotname += "_highres"

    return {
        plotname: {
            "data": [
                {
                    "x": xedges.tolist(),
                    "y": yedges.tolist(),
                    "z": z.transpose().tolist(),
                    "type": "heatmap",
                    "name": "normalised deviations",
                    "colorbar": {
                        "title": "Number of reflections",
                        "titleside": "right",
                    },
                    "colorscale": "Jet",
                },
                {
                    "x": [-5, 5],
                    "y": [-5, 5],
                    "type": "scatter",
                    "mode": "lines",
                    "name": "z = m",
                    "color": "rgb(0,0,0)",
                },
            ],
            "layout": {
                "title": title,
                "xaxis": {"anchor": "y", "title": "expected delta", "range": [-4, 4]},
                "yaxis": {"anchor": "x", "title": "observed delta", "range": [-5, 5]},
            },
            "help": """\
This plot shows the normalised anomalous differences, sorted in order and
plotted against the expected order based on a normal distribution model.
A true normal distribution of deviations would give the straight line
indicated.

[1] P. L. Howell and G. D. Smith, J. Appl. Cryst. (1992). 25, 81-86
https://doi.org/10.1107/S0021889891010385
[2] P. Evans, Acta Cryst. (2006). D62, 72-82
https://doi.org/10.1107/S0907444905036693
""",
        }
    }
def exercise(self, debug=False):
    if debug:
        distribution = distributions.normal_distribution()
        observed_deviations = (
            self.fo2.data() - self.scale_factor * self.fc.as_intensity_array().data())
        observed_deviations = observed_deviations.select(
            flex.sort_permutation(observed_deviations))
        expected_deviations = distribution.quantiles(observed_deviations.size())
        csv_utils.writer(
            open('delta_F_npp.csv', 'wb'),
            (expected_deviations, observed_deviations))

    # first with the correct absolute structure
    gaussian = absolute_structure.hooft_analysis(self.fo2, self.fc)
    analyses = [gaussian]
    NPP = absolute_structure.bijvoet_differences_probability_plot(gaussian)
    if self.use_students_t_errors:
        nu_calc = absolute_structure.maximise_students_t_correlation_coefficient(
            NPP.y, min_nu=1, max_nu=200)
        t_analysis = absolute_structure.students_t_hooft_analysis(
            self.fo2, self.fc, nu_calc, probability_plot_slope=NPP.fit.slope())
        analyses.append(t_analysis)
        tPP = absolute_structure.bijvoet_differences_probability_plot(
            t_analysis, use_students_t_distribution=True, students_t_nu=nu_calc)
        if tPP.distribution.degrees_of_freedom() < 100:
            assert tPP.correlation.coefficient() > NPP.correlation.coefficient()
    else:
        assert approx_equal(NPP.correlation.coefficient(), 1, 0.005)
    for analysis in analyses:
        assert approx_equal(analysis.hooft_y, 0, 1e-2)
        assert approx_equal(analysis.p2_true, 1)
        assert approx_equal(analysis.p2_false, 0)
        assert approx_equal(analysis.p3_true, 1)
        assert approx_equal(analysis.p3_false, 0)
        assert approx_equal(analysis.p3_racemic_twin, 0)
    if debug:
        csv_utils.writer(open('npp.csv', 'wb'), (NPP.x, NPP.y))
        if self.use_students_t_errors:
            csv_utils.writer(open('tpp.csv', 'wb'), (tPP.x, tPP.y))
    assert approx_equal(NPP.fit.y_intercept(), 0)

    # and now with the wrong absolute structure
    gaussian = absolute_structure.hooft_analysis(self.fo2, self.fc_i)
    analyses = [gaussian]
    NPP = absolute_structure.bijvoet_differences_probability_plot(gaussian)
    if self.use_students_t_errors:
        nu_calc = absolute_structure.maximise_students_t_correlation_coefficient(
            NPP.y, min_nu=1, max_nu=200)
        t_analysis = absolute_structure.students_t_hooft_analysis(
            self.fo2, self.fc_i, nu_calc, probability_plot_slope=NPP.fit.slope())
        analyses.append(t_analysis)
        tPP = absolute_structure.bijvoet_differences_probability_plot(
            t_analysis, use_students_t_distribution=True)
        if tPP.distribution.degrees_of_freedom() < 100:
            assert tPP.correlation.coefficient() > NPP.correlation.coefficient()
    else:
        assert approx_equal(NPP.correlation.coefficient(), 1, 0.002)
        assert approx_equal(NPP.fit.y_intercept(), 0)
    for analysis in analyses:
        assert approx_equal(analysis.hooft_y, 1, 1e-2)
        assert approx_equal(analysis.p2_true, 0)
        assert approx_equal(analysis.p2_false, 1)
        assert approx_equal(analysis.p3_true, 0)
        assert approx_equal(analysis.p3_false, 1)
        assert approx_equal(analysis.p3_racemic_twin, 0)

    # test for the case of a racemic twin
    gaussian = absolute_structure.hooft_analysis(self.fo2_twin, self.fc)
    analyses = [gaussian]
    NPP = absolute_structure.bijvoet_differences_probability_plot(gaussian)
    if self.use_students_t_errors:
        nu_calc = absolute_structure.maximise_students_t_correlation_coefficient(
            NPP.y, min_nu=1, max_nu=200)
        t_analysis = absolute_structure.students_t_hooft_analysis(
            self.fo2_twin, self.fc, nu_calc, probability_plot_slope=NPP.fit.slope())
        tPP = absolute_structure.bijvoet_differences_probability_plot(
            t_analysis, use_students_t_distribution=True)
        if tPP.distribution.degrees_of_freedom() < 100:
            assert tPP.correlation.coefficient() > NPP.correlation.coefficient()
    else:
        assert approx_equal(NPP.correlation.coefficient(), 1, 0.002)
        assert approx_equal(NPP.fit.y_intercept(), 0)
    for analysis in analyses:
        assert approx_equal(analysis.hooft_y, 0.5, 1e-2)
        assert approx_equal(analysis.p3_true, 0)
        assert approx_equal(analysis.p3_false, 0)
        assert approx_equal(analysis.p3_racemic_twin, 1)
def normal_probability_plot(data):
    """Plot the distribution of normal probabilities of errors."""
    norm = distributions.normal_distribution()

    n = len(data["delta_hl"])
    # Blom's approximation for the rankit offset
    if n <= 10:
        a = 3 / 8
    else:
        a = 0.5

    y = flex.sorted(flex.double(data["delta_hl"]))
    x = [norm.quantile((i + 1 - a) / (n + 1 - (2 * a))) for i in range(n)]

    H, xedges, yedges = np.histogram2d(np.array(x), y.as_numpy_array(), bins=(200, 200))
    nonzeros = np.nonzero(H)
    z = np.empty(H.shape)
    z[:] = np.nan
    z[nonzeros] = H[nonzeros]

    # also make a histogram
    histy = flex.histogram(y, n_slots=100)
    # make a gaussian for reference also
    n = y.size()
    width = histy.slot_centers()[1] - histy.slot_centers()[0]
    gaussian = [
        n * width * math.exp(-(sc**2) / 2.0) / ((2.0 * math.pi)**0.5)
        for sc in histy.slot_centers()
    ]

    return {
        "normal_distribution_plot": {
            "data": [
                {
                    "x": xedges.tolist(),
                    "y": yedges.tolist(),
                    "z": z.transpose().tolist(),
                    "type": "heatmap",
                    "name": "normalised deviations",
                    "colorbar": {
                        "title": "Number of reflections",
                        "titleside": "right",
                    },
                    "colorscale": "Jet",
                },
                {
                    "x": [-5, 5],
                    "y": [-5, 5],
                    "type": "scatter",
                    "mode": "lines",
                    "name": "z = m",
                    "color": "rgb(0,0,0)",
                },
            ],
            "layout": {
                "title": "Normal probability plot with error model applied",
                "xaxis": {"anchor": "y", "title": "Order statistic medians, m"},
                "yaxis": {"anchor": "x", "title": "Ordered responses, z"},
            },
            "help": """\
This plot shows the normalised deviations (of each reflection from the
group-weighted mean), sorted in order and plotted against the expected order
based on a normal distribution model. A true normal distribution of deviations
would give the straight line indicated. If the errors are well described by
this model, the ordered responses should closely fit the straight line to
high absolute values of x (>3), where there is typically a deviation away
from the line due to wide tails of the distribution.
""",
        },
        "nor_dev_hist": {
            "data": [
                {
                    "x": list(histy.slot_centers()),
                    "y": list(histy.slots()),
                    "type": "bar",
                    "name": "dataset normalised deviations",
                },
                {
                    "x": list(histy.slot_centers()),
                    "y": gaussian,
                    "type": "scatter",
                    "name": "Ideal normal distribution",
                },
            ],
            "layout": {
                "title": "Normal deviations with error model applied",
                "xaxis": {"anchor": "y", "title": "Normalised deviation"},
                "yaxis": {"anchor": "x", "title": "Number of reflections"},
            },
            "help": """\
This plot shows the distribution of normalised deviations (of each reflection
from the group-weighted mean), for the reflections used to minimise the error
model. A true normal distribution is indicated.
""",
        },
    }
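# Where the reference gaussian scaling above comes from (a numpy sketch, not
# part of the original function): for n samples from N(0, 1), the expected
# count in a histogram bin of width w centred at x is about n * w * pdf(x),
# which is exactly the n * width * exp(-x**2 / 2) / sqrt(2*pi) term used in
# the gaussian list comprehension.
import numpy as np

rng = np.random.default_rng(0)
samples = rng.standard_normal(100000)
counts, edges = np.histogram(samples, bins=100, range=(-5, 5))
width = edges[1] - edges[0]
centres = 0.5 * (edges[:-1] + edges[1:])
expected = samples.size * width * np.exp(-centres**2 / 2) / np.sqrt(2 * np.pi)
# counts and expected agree to within sampling noise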