def npp(hklin):
    from iotbx.reflection_file_reader import any_reflection_file
    from xia2.Toolkit.NPP import npp_ify
    from scitbx.array_family import flex
    import math
    import sys

    reader = any_reflection_file(hklin)
    mtz_object = reader.file_content()
    intensities = [
        ma
        for ma in reader.as_miller_arrays(merge_equivalents=False)
        if ma.info().labels == ['I', 'SIGI']
    ][0]
    indices = intensities.indices()

    # merging: use external variance i.e. variances derived from SIGI column
    merger = intensities.merge_equivalents(use_internal_variance=False)
    mult = merger.redundancies().data()
    imean = merger.array()
    unique = imean.indices()
    iobs = imean.data()
    # scale up variance to account for sqrt(multiplicity) effective scaling
    variobs = (imean.sigmas() ** 2) * mult.as_double()

    all = flex.double()
    cen = flex.double()

    for hkl, i, v, m in zip(unique, iobs, variobs, mult):
        # only consider if meaningful number of observations
        if m < 3:
            continue

        sel = indices == hkl
        data = intensities.select(sel).data()
        assert m == len(data)

        _x, _y = npp_ify(data, input_mean_variance=(i, v))

        # perform linreg on (i) all data and (ii) subset between +/- 2 sigma
        sel = flex.abs(_x) < 2
        _x_ = _x.select(sel)
        _y_ = _y.select(sel)

        fit_all = flex.linear_regression(_x, _y)
        fit_cen = flex.linear_regression(_x_, _y_)

        all.append(fit_all.slope())
        cen.append(fit_cen.slope())

        print(
            '%3d %3d %3d' % hkl,
            '%.2f %.2f %.2f' % (i, v, i / math.sqrt(v)),
            '%.2f %.2f' % (fit_all.slope(), fit_cen.slope()),
            '%d' % m,
        )

    sys.stderr.write(
        'Mean gradients: %.2f %.2f\n'
        % (flex.sum(all) / all.size(), flex.sum(cen) / cen.size())
    )
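# The npp_ify() helper imported above is not shown in this file. Below is a
# minimal sketch of what it is assumed to do: the standard normal probability
# plot construction, i.e. sort the observations, express them as deviations
# from the (supplied or estimated) mean in units of sigma, and pair them with
# the expected quantiles of a standard normal distribution. The name
# npp_ify_sketch and the internals are illustrative, not the xia2
# implementation.
def npp_ify_sketch(values, input_mean_variance=None):
    import math
    from scitbx.array_family import flex
    from scitbx.math import distributions

    # sort observations into ascending order
    values = values.select(flex.sort_permutation(values))
    if input_mean_variance is not None:
        mean, variance = input_mean_variance
    else:
        mv = flex.mean_and_variance(values)
        mean, variance = mv.mean(), mv.unweighted_sample_variance()

    # observed deviations from the mean, scaled by sigma
    scaled = (values - mean) / math.sqrt(variance)
    # expected standard-normal order statistics for a sample of this size
    expected = distributions.normal_distribution().quantiles(values.size())
    return expected, scaled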
def test():
    # imports added for self-containment: variate and poisson_distribution
    # come from scitbx.random; npp_ify is the helper used in npp() above
    from scitbx.array_family import flex
    from scitbx.random import variate, poisson_distribution

    # draw 1000 counts from a Poisson distribution with mean 1000, which is
    # close to normal, so the fitted normal-probability-plot gradient should
    # be close to 1
    numbers = variate(poisson_distribution(mean=1000))
    data = flex.double()
    for j in range(1000):
        data.append(next(numbers))

    _x, _y = npp_ify(data)
    fit = flex.linear_regression(_x, _y)
    fit.show_summary()

    _x, _y = npp_ify(data, input_mean_variance=(1000, 1000))
    fit = flex.linear_regression(_x, _y)
    fit.show_summary()
def compute_rg_from_data(self, q, i):
    q_sq = q * q
    ln_i = flex.log(i)
    cc_obj = flex.linear_regression(q_sq, ln_i)
    rg2 = -cc_obj.slope() * 3.0
    lni = cc_obj.y_intercept()
    return rg2, lni
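# Usage sketch for compute_rg_from_data() above, on synthetic data (the names
# and values here are made up for illustration). The Guinier approximation
# ln I(q) = ln I(0) - (Rg^2 / 3) q^2 means the fitted slope is -Rg^2 / 3, so
# the method returns Rg^2 (not Rg) together with ln I(0). The method never
# touches self, so it is called here with a dummy None receiver, which assumes
# it is reachable as a plain function.
def _guinier_demo():
    import math
    from scitbx.array_family import flex

    rg_true, i0 = 20.0, 1000.0
    q = flex.double([0.005 * (j + 1) for j in range(20)])
    i = i0 * flex.exp(-(rg_true ** 2) * q * q / 3.0)
    rg2, lni = compute_rg_from_data(None, q, i)
    print(math.sqrt(rg2), math.exp(lni))  # ~20.0 and ~1000.0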
def estimate_cc_sig_fac(self):
    # A1.1. Estimation of sigma(CC) as a function of sample size.

    binner = self.intensities.setup_binner_counting_sorted(reflections_per_bin=200)

    a = flex.double()
    b = flex.double()
    for i in range(binner.n_bins_all()):
        count = binner.counts()[i]
        if count == 0:
            continue
        bin_isel = binner.array_indices(i)
        p = flex.random_permutation(count)
        p = p[: 2 * (count // 2)]  # ensure even count
        a.extend(self.intensities.data().select(bin_isel.select(p[: count // 2])))
        b.extend(self.intensities.data().select(bin_isel.select(p[count // 2 :])))

    perm = flex.random_selection(a.size(), min(20000, a.size()))
    a = a.select(perm)
    b = b.select(perm)

    self.corr_unrelated = CorrelationCoefficientAccumulator(a, b)

    n_pairs = a.size()
    min_num_groups = 10  # minimum number of groups
    max_n_group = int(min(n_pairs / min_num_groups, 200))  # maximum number in group
    min_n_group = int(min(5, max_n_group))  # minimum number in group

    mean_ccs = flex.double()
    rms_ccs = flex.double()
    ns = flex.double()
    for n in range(min_n_group, max_n_group):
        ns.append(n)
        ccs = flex.double()
        for i in range(200):
            isel = flex.random_selection(a.size(), n)
            corr = CorrelationCoefficientAccumulator(a.select(isel), b.select(isel))
            ccs.append(corr.coefficient())

        mean_ccs.append(flex.mean(ccs))
        rms_ccs.append(flex.mean(flex.pow2(ccs)) ** 0.5)

    x = 1 / flex.pow(ns, 0.5)
    y = rms_ccs

    fit = flex.linear_regression(x, y)
    assert fit.is_well_defined()
    self.cc_sig_fac = fit.slope()

    if 0:
        from matplotlib import pyplot as plt

        plt.plot(x, y)
        plt.plot(
            plt.xlim(), [fit.slope() * x_ + fit.y_intercept() for x_ in plt.xlim()]
        )
        plt.show()
def get_r_free_stats(miller_array, test_flag_value):
    from scitbx.array_family import flex

    array = get_r_free_as_bool(miller_array, test_flag_value)
    n_free = array.data().count(True)
    accu = array.sort(by_value="resolution").r_free_flags_accumulation()
    lr = flex.linear_regression(
        accu.reflection_counts.as_double(), accu.free_fractions
    )
    assert lr.is_well_defined()
    slope = lr.slope()
    y_ideal = accu.reflection_counts.as_double() * slope
    sse = flex.sum(flex.pow(y_ideal - accu.free_fractions, 2))
    n_bins = 0
    n_ref_last = 0
    for x in accu.reflection_counts:
        if x > (n_ref_last + 1):
            n_bins += 1
            n_ref_last = x
    return (n_bins, n_free, sse, accu)
def find_delta(rho_map, tol):
    """ Find delta as hinted on fig. 1 of ref. [1] in module charge_flipping """
    rho = rho_map.real_map_unpadded().as_1d()
    max_rho = flex.max(rho)
    rho /= max_rho
    sorting = flex.sort_permutation(rho)
    sorted_rho = rho.select(sorting)
    n = len(sorted_rho)
    p, q = n // 4, 3 * n // 4
    indexes = flex.double_range(p, q)
    values = sorted_rho[p:q]
    c = flex.linear_correlation(indexes, values)
    assert c.is_well_defined() and c.coefficient() > 0.99
    r = flex.linear_regression(indexes, values)
    a, b = r.y_intercept(), r.slope()
    deviation = flex.abs(a + b * flex.double_range(n) - sorted_rho)
    non_linear_sel = deviation > tol
    low = flex.first_index(non_linear_sel, False)
    high = flex.last_index(non_linear_sel, False)
    assert non_linear_sel[low:high].count(False) / (high - low + 1) > 0.99
    assert sorted_rho[low] < 0 and sorted_rho[high] > 0
    return min(sorted_rho[high], -sorted_rho[low]), max_rho
def _estimate_cc_sig_fac(self):
    """Estimation of sigma(CC) as a function of sample size.

    Estimate the error in the correlation coefficient, sigma(CC), by using
    pairs of reflections at similar resolutions that are not related by
    potential symmetry. Using pairs of unrelated reflections at similar
    resolutions, calculate sigma(CC) == rms(CC) for groups of size N = 3..200.
    The constant CCsigFac is obtained from a linear fit of sigma(CC) to
    1/N^(1/2), i.e.:

        sigma(CC) = CCsigFac / N^(1/2)
    """

    max_bins = 500
    reflections_per_bin = max(
        200, int(math.ceil(self.intensities.size() / max_bins))
    )
    binner = self.intensities.setup_binner_counting_sorted(
        reflections_per_bin=reflections_per_bin
    )

    a = flex.double()
    b = flex.double()
    ma_tmp = self.intensities.customized_copy(
        crystal_symmetry=crystal.symmetry(
            space_group=self.lattice_group,
            unit_cell=self.intensities.unit_cell(),
            assert_is_compatible_unit_cell=False,
        )
    ).map_to_asu()
    for i in range(binner.n_bins_all()):
        count = binner.counts()[i]
        if count == 0:
            continue
        bin_isel = binner.array_indices(i)
        p = flex.random_permutation(count)
        p = p[: 2 * (count // 2)]  # ensure even count
        ma_a = ma_tmp.select(bin_isel.select(p[: count // 2]))
        ma_b = ma_tmp.select(bin_isel.select(p[count // 2 :]))
        # only choose pairs of reflections that don't have the same indices
        # in the asu of the lattice group
        sel = ma_a.indices() != ma_b.indices()
        a.extend(ma_a.data().select(sel))
        b.extend(ma_b.data().select(sel))

    perm = flex.random_selection(a.size(), min(20000, a.size()))
    a = a.select(perm)
    b = b.select(perm)

    self.corr_unrelated = CorrelationCoefficientAccumulator(a, b)

    n_pairs = a.size()
    min_num_groups = 10  # minimum number of groups
    max_n_group = int(min(n_pairs / min_num_groups, 200))  # maximum number in group
    min_n_group = int(min(5, max_n_group))  # minimum number in group
    if (max_n_group - min_n_group) < 4:
        self.cc_sig_fac = 0
        return

    mean_ccs = flex.double()
    rms_ccs = flex.double()
    ns = flex.double()
    for n in range(min_n_group, max_n_group + 1):
        ns.append(n)
        ccs = flex.double()
        for i in range(200):
            isel = flex.random_selection(a.size(), n)
            corr = CorrelationCoefficientAccumulator(a.select(isel), b.select(isel))
            ccs.append(corr.coefficient())

        mean_ccs.append(flex.mean(ccs))
        rms_ccs.append(flex.mean(flex.pow2(ccs)) ** 0.5)

    x = 1 / flex.pow(ns, 0.5)
    y = rms_ccs

    fit = flex.linear_regression(x, y)
    if fit.is_well_defined():
        self.cc_sig_fac = fit.slope()
    else:
        self.cc_sig_fac = 0
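# Quick illustrative sanity check of the sigma(CC) ~ CCsigFac / N^(1/2) model
# fitted above: for completely uncorrelated random pairs the fitted CCsigFac
# should come out close to 1, since Var(CC) ~ 1/N for unrelated data. This
# sketch uses flex.linear_correlation rather than the
# CorrelationCoefficientAccumulator class referenced above, which is not shown
# in this file; the function name is made up for the demonstration.
def _cc_sig_fac_demo():
    from scitbx.array_family import flex

    ns = flex.double()
    rms_ccs = flex.double()
    for n in range(5, 201, 5):
        ccs = flex.double()
        for _ in range(200):
            a = flex.random_double(n)
            b = flex.random_double(n)
            ccs.append(flex.linear_correlation(a, b).coefficient())
        ns.append(n)
        rms_ccs.append(flex.mean(flex.pow2(ccs)) ** 0.5)

    # fit rms(CC) against 1/sqrt(N); the slope is the empirical CCsigFac
    fit = flex.linear_regression(1 / flex.pow(ns, 0.5), rms_ccs)
    print(fit.slope())  # expected to be close to 1 for random data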
def semisynthetic_variance_analysis(semisynthetic_integrated_data_files, value_column):
    import six.moves.cPickle as pickle
    import math
    from dials.array_family import flex
    from dials.util.add_hash import add_hash

    integrated_data_sets = [
        pickle.load(open(data_file, "rb"))
        for data_file in semisynthetic_integrated_data_files
    ]

    # check column, find variance
    integrated_data = integrated_data_sets[0]
    assert value_column in integrated_data
    variance_column = None
    for column in [
        "%s.variance" % value_column,
        value_column.replace("value", "variance"),
    ]:
        if column in integrated_data:
            variance_column = column
            break
    assert variance_column
    data = integrated_data[value_column]
    if hasattr(data, "parts"):
        multicolumn = len(data.parts())
    else:
        multicolumn = 0

    # first prepare the data files i.e. remove partials, keep only integrated
    # reflections, add the hash column, add weight column
    hash_set = None
    hashed_data_sets = []
    for integrated_data in integrated_data_sets:
        size0 = integrated_data.size()
        if "intensity" in value_column:
            sel = integrated_data.get_flags(integrated_data.flags.integrated)
            integrated_data = integrated_data.select(sel)
            sel = integrated_data["partiality"] > 0.99
            integrated_data = integrated_data.select(sel)
        elif "xyzobs" in value_column:
            sel = integrated_data.get_flags(integrated_data.flags.indexed)
            integrated_data = integrated_data.select(sel)
        integrated_data = add_hash(integrated_data)
        hashed_data_sets.append(integrated_data)
        if hash_set is None:
            hash_set = set(integrated_data["hash"])
        else:
            hash_set = hash_set.intersection(set(integrated_data["hash"]))
        size1 = integrated_data.size()

    duplicate = []
    for h in hash_set:
        # check for duplicates i.e. reflection at 0, 2pi
        for i in hashed_data_sets:
            sel = i["hash"] == h
            isel = sel.iselection()
            if len(isel) > 1:
                duplicate.append(h)
    for d in duplicate:
        hash_set.discard(d)

    # now analyse those reflections found to be in all data sets (here looking
    # at the profile fitted intensity and variance thereof)
    for h in hash_set:
        if not multicolumn:
            values = flex.double()
            variances = flex.double()
            for i in hashed_data_sets:
                sel = i["hash"] == h
                isel = sel.iselection()
                assert len(isel) == 1
                values.append(i[isel[0]][value_column])
                variances.append(i[isel[0]][variance_column])
            weighted_mean, weighted_variance = weighted_mean_variance(values, variances)
            expected, scaled = npp(values, (weighted_mean, weighted_variance))
            fit = flex.linear_regression(expected, scaled)
            # since I have everything needed to compute chi-square here...
            n = len(values)
            chi2 = (
                sum([((v - weighted_mean) ** 2) / weighted_variance for v in values])
                / n
            )
            print(
                "%.3f %.3f %.3f"
                % (weighted_mean / math.sqrt(weighted_variance), fit.slope(), chi2)
            )
        else:
            values = {}
            variances = {}
            for m in range(multicolumn):
                values[m] = flex.double()
                variances[m] = flex.double()
            for i in hashed_data_sets:
                sel = i["hash"] == h
                isel = sel.iselection()
                assert len(isel) == 1
                data = i[isel[0]][value_column]
                variance = i[isel[0]][variance_column]
                for m in range(multicolumn):
                    values[m].append(data[m])
                    variances[m].append(variance[m])
            result = ""
            for m in range(multicolumn):
                weighted_mean, weighted_variance = weighted_mean_variance(
                    values[m], variances[m]
                )
                expected, scaled = npp(values[m], (weighted_mean, weighted_variance))
                fit = flex.linear_regression(expected, scaled)
                # since I have everything needed to compute chi-square here...
                n = len(values[m])
                chi2 = (
                    sum(
                        [
                            ((v - weighted_mean) ** 2) / weighted_variance
                            for v in values[m]
                        ]
                    )
                    / n
                )
                result += "%f %.3f %.3f " % (
                    math.sqrt(weighted_variance),
                    fit.slope(),
                    chi2,
                )
            print(result)
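# The weighted_mean_variance() helper used above is not shown in this file.
# Below is a minimal sketch of the inverse-variance weighting it is assumed to
# perform; the name and body are illustrative, not the dials implementation.
def weighted_mean_variance_sketch(values, variances):
    from scitbx.array_family import flex

    # inverse-variance weights
    weights = 1.0 / variances
    weighted_mean = flex.sum(weights * values) / flex.sum(weights)
    # variance of the weighted mean
    weighted_variance = 1.0 / flex.sum(weights)
    return weighted_mean, weighted_variance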