-
Notifications
You must be signed in to change notification settings - Fork 0
/
merging_statistics.py
826 lines (788 loc) · 33.6 KB
/
merging_statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
"""
Routines for calculating common metrics of data quality based on merging of
redundant observations.
"""
from __future__ import division
from iotbx import data_plots
from libtbx.str_utils import make_sub_header, format_value
from libtbx.utils import Sorry, null_out
from libtbx import group_args, Auto
from math import sqrt
import cStringIO
import sys
# Literature references for the merging statistics implemented in this
# module (R-meas/R-pim, CC1/2, CC*); programs display these when citing
# their methods.
citations_str = """\
Diederichs K & Karplus PA (1997) Nature Structural Biology 4:269-275
(with erratum in: Nat Struct Biol 1997 Jul;4(7):592)
Weiss MS (2001) J Appl Cryst 34:130-135.
Karplus PA & Diederichs K (2012) Science 336:1030-3."""

# PHIL definition for the sigma-filtering convention choice; see
# filter_intensities_by_sigma below for the semantics of each option.
sigma_filtering_phil_str = """
sigma_filtering = *auto xds scala scalepack
.type = choice
.short_caption = Sigma(I) filtering convention
.help = Determines how data are filtered by SigmaI and I/SigmaI. XDS \
discards reflections whose intensity after merging is less than -3*sigma, \
Scalepack uses the same cutoff before merging, and SCALA does not do any \
filtering. Reflections with negative SigmaI will always be discarded.
"""

# PHIL parameters consumed by dataset_statistics: resolution limits, number
# of shells, anomalous handling, plus the sigma-filtering choice above.
merging_params_str = """
high_resolution = None
.type = float
.input_size = 64
low_resolution = None
.type = float
.input_size = 64
n_bins = 10
.type = int
.short_caption = Number of resolution bins
.input_size = 64
.style = spinner
extend_d_max_min = False
.type = bool
.expert_level = 2
anomalous = False
.type = bool
.short_caption = Keep anomalous pairs separate in merging statistics
%s
""" % sigma_filtering_phil_str
class model_based_arrays (object) :
  """
  Container for observed and calculated intensities together with the work
  and free set selections; these are normally supplied by mmtbx.f_model.
  The resolution range is expected (but not required) to match that of the
  unmerged data being analyzed.
  """
  def __init__ (self, f_obs, i_obs, i_calc, work_sel, free_sel) :
    # every intensity-sized array must line up element-for-element
    assert (i_obs.data().size() == i_calc.data().size() ==
            work_sel.data().size() == free_sel.data().size())
    assert (len(f_obs.data()) <= len(i_obs.data()))
    self.f_obs = f_obs
    # restrict all other arrays to the reflections actually present in f_obs
    self.i_obs = i_obs.common_set(other=self.f_obs)
    self.i_calc = i_calc.common_set(other=self.f_obs)
    self.work_sel = work_sel.common_set(other=self.f_obs)
    self.free_sel = free_sel.common_set(other=self.f_obs)

  def cc_work_and_free (self, other) :
    """
    Given a unique array of arbitrary resolution range, pull out the matching
    observed and calculated intensities and compute CC and R-factor for the
    work and free sets.  All four statistics are None when no reflections
    match, or when either the work or the free subset is empty.
    """
    assert (self.i_obs.is_similar_symmetry(other))
    i_obs_cut = self.i_obs.common_set(other=other)
    f_obs_cut = self.f_obs.common_set(other=other)
    i_calc_cut = self.i_calc.common_set(other=other)
    work_flags = self.work_sel.common_set(other=other)
    free_flags = self.free_sel.common_set(other=other)
    if (len(i_obs_cut.data()) == 0) : # XXX should this raise an error?
      return [None] * 4
    f_obs_work = f_obs_cut.select(work_flags.data())
    f_obs_free = f_obs_cut.select(free_flags.data())
    # nothing meaningful to report unless both subsets are populated
    if (len(f_obs_work.data()) == 0) or (len(f_obs_free.data()) == 0) :
      return [None] * 4
    from scitbx.array_family import flex
    i_obs_work = i_obs_cut.select(work_flags.data())
    i_calc_work = i_calc_cut.select(work_flags.data())
    i_obs_free = i_obs_cut.select(free_flags.data())
    i_calc_free = i_calc_cut.select(free_flags.data())
    cc_work = flex.linear_correlation(i_obs_work.data(),
      i_calc_work.data()).coefficient()
    cc_free = flex.linear_correlation(i_obs_free.data(),
      i_calc_free.data()).coefficient()
    r_work = f_obs_work.r1_factor(i_calc_work.f_sq_as_f())
    r_free = f_obs_free.r1_factor(i_calc_free.f_sq_as_f())
    return cc_work, cc_free, r_work, r_free
def get_filtering_convention (i_obs, sigma_filtering=Auto) :
  """
  Resolve the sigma-filtering convention to use.  An explicit choice is
  returned unchanged; Auto/"auto" is mapped from the file type that produced
  the input array, falling back to the most conservative convention
  ("scala", i.e. no filtering) for unrecognized sources.
  """
  if (sigma_filtering not in [Auto, "auto"]) :
    return sigma_filtering
  source_to_convention = {
    "xds_ascii" : "xds",
    "ccp4_mtz" : "scala",
    "scalepack_no_merge_original_index" : "scalepack",
  }
  info = i_obs.info()
  # XXX unknown sources default to the most conservative method
  return source_to_convention.get(info.source_type, "scala")
class filter_intensities_by_sigma (object) :
  """
  Applies one of several intensity-filtering conventions and exposes both
  the filtered unmerged array and its merged equivalent:
  - "xds": reflections with I < -3*sigmaI *after* merging are removed from
    the merged and unmerged arrays (which are then re-merged)
  - "scalepack": the same -3*sigma cutoff, but applied *before* merging
  - "scala"/None: no filtering (SCALA/AIMLESS behavior)
  Note that ctruncate and cctbx.french_wilson (any others?) do their own
  filtering, e.g. discarding I < -4*sigma in cctbx.french_wilson.
  """
  def __init__ (self, array, sigma_filtering=Auto) :
    sigma_filtering = get_filtering_convention(array, sigma_filtering)
    assert (sigma_filtering in ["scala","scalepack","xds", None])
    self.n_rejected_before_merge = 0
    self.n_rejected_after_merge = 0
    self.observed_criterion_sigma_I = None
    merge = array.merge_equivalents()
    array_merged = merge.array()
    if (sigma_filtering == "xds") :
      self.observed_criterion_sigma_I = -3
      # flag excessively negative merged intensities, strip the unmerged
      # observations that contributed to them, and merge again
      reject_sel = (array_merged.data() < -3*array_merged.sigmas())
      self.n_rejected_after_merge = reject_sel.count(True)
      bad_data = array_merged.select(reject_sel)
      array = array.delete_indices(other=bad_data)
      merge = array.merge_equivalents()
      array_merged = merge.array()
    elif (sigma_filtering == "scalepack") :
      self.observed_criterion_sigma_I = -3
      # cutoff applied to the raw observations, prior to merging
      reject_sel = (array.data() < -3* array.sigmas())
      self.n_rejected_before_merge = reject_sel.count(True)
      array = array.select(~reject_sel)
      merge = array.merge_equivalents()
      array_merged = merge.array()
    elif (sigma_filtering is not None) and (sigma_filtering != "scala") :
      # unreachable when the assert above is active; kept for -O runs
      raise ValueError("Unrecognized sigmaI filtering convention '%s'." %
        sigma_filtering)
    self.array = array
    self.merge = merge
    self.array_merged = array_merged
class merging_stats (object) :
  """
  Calculate standard merging statistics for (scaled) unmerged data. Usually
  these statistics will consider I(+) and I(-) as observations of the same
  reflection, but these can be kept separate instead if desired.
  Reflections with negative sigmas will be discarded, and depending on the
  program we're trying to mimic, excessively negative intensities.
  """
  def __init__ (self,
      array,
      d_max_min=None,
      model_arrays=None,
      anomalous=False,
      debug=None,
      sigma_filtering="scala") :
    # :param array: unmerged intensity array (must carry sigmas)
    # :param d_max_min: optional (d_max, d_min) tuple; derived from the data
    #   when None
    # :param model_arrays: optional model_based_arrays for CC(work)/CC(free)
    # :param anomalous: keep I(+)/I(-) separate during merging
    # :param sigma_filtering: convention name ("scala"/"scalepack"/"xds"/None)
    # NOTE(review): 'debug' is accepted but never used in this method
    import cctbx.miller
    from scitbx.array_family import flex
    assert (array.sigmas() is not None)
    array = array.eliminate_sys_absent()
    non_negative_sel = array.sigmas() >= 0
    self.n_neg_sigmas = non_negative_sel.count(False)
    positive_sel = array.sigmas() > 0
    # observations with sigma == 0 are also dropped, and counted separately
    self.n_zero_sigmas = positive_sel.count(False) - self.n_neg_sigmas
    array = array.select(positive_sel)
    # calculate CC(anom) first, because the default behavior is to switch to
    # non-anomalous data for the rest of the analyses
    self.anom_half_corr = array.half_dataset_anomalous_correlation()
    array = array.customized_copy(anomalous_flag=anomalous).map_to_asu()
    array = array.sort("packed_indices")
    # NOTE(review): 'filter' shadows the builtin of the same name
    filter = filter_intensities_by_sigma(
      array=array,
      sigma_filtering=sigma_filtering)
    if (d_max_min is None) :
      d_max_min = array.d_max_min()
    self.d_max, self.d_min = d_max_min
    self.observed_criterion_sigma_I = filter.observed_criterion_sigma_I
    array = filter.array
    merge = filter.merge
    array_merged = filter.array_merged
    self.n_rejected_before_merge = filter.n_rejected_before_merge
    self.n_rejected_after_merge = filter.n_rejected_after_merge
    self.n_obs = array.indices().size()
    self.n_uniq = array_merged.indices().size()
    complete_set = array_merged.complete_set().resolution_filter(
      d_min=self.d_min, d_max=self.d_max)
    if (self.n_uniq == 0) :
      # no surviving reflections: rebuild the expected set from symmetry so
      # completeness is still well-defined (i.e. zero, not a crash)
      complete_set = cctbx.miller.build_set(
        crystal_symmetry=array_merged,
        anomalous_flag=anomalous,
        d_min=self.d_min).resolution_filter(d_min=self.d_min, d_max=self.d_max)
    n_expected = len(complete_set.indices())
    if (n_expected == 0) :
      raise RuntimeError(("No reflections within specified resolution range "+
        "(%g - %g)") % (self.d_max, self.d_min))
    self.completeness = min(self.n_uniq / n_expected, 1.)
    self.anom_completeness = None
    # TODO also calculate when anomalous=False, since it is customary to
    # calculate merging statistics with F+ and F- treated as redundant
    # observations even when we're going to keep them separate.
    if (anomalous) :
      self.anom_completeness = array_merged.anomalous_completeness()
    redundancies = merge.redundancies().data()
    # histogram of observation multiplicities: {multiplicity : n_reflections}
    self.redundancies = {}
    # all statistics default to 0/None in case the array is empty
    self.mean_redundancy = 0
    self.i_mean = 0
    self.sigi_mean = 0
    self.i_over_sigma_mean = 0
    self.i_mean_over_sigi_mean = 0
    self.cc_one_half = 0
    self.cc_star = 0
    self.r_merge = self.r_meas = self.r_pim = None
    for x in sorted(set(redundancies)) :
      self.redundancies[x] = redundancies.count(x)
    if (self.n_uniq > 0) :
      self.mean_redundancy = flex.mean(redundancies.as_double())
      self.i_mean = flex.mean(array_merged.data())
      self.sigi_mean = flex.mean(array_merged.sigmas())
      # guard against division by zero when averaging I/sigma
      nonzero_array = array_merged.select(array_merged.sigmas() > 0)
      i_over_sigma = nonzero_array.data() / nonzero_array.sigmas()
      self.i_over_sigma_mean = flex.mean(i_over_sigma)
      self.i_mean_over_sigi_mean = self.i_mean/self.sigi_mean
      self.r_merge = merge.r_merge()
      self.r_meas = merge.r_meas()
      self.r_pim = merge.r_pim()
      self.cc_one_half = cctbx.miller.compute_cc_one_half(
        unmerged=array)
      # CC* = sqrt(2*CC1/2 / (1 + CC1/2)), Karplus & Diederichs (2012);
      # the sign of CC1/2 is carried through, and values near -1 (where the
      # denominator vanishes) are mapped to -inf
      if (self.cc_one_half == 0) :
        self.cc_star = 0
      elif (self.cc_one_half < -0.999) :
        self.cc_star = float("-inf")
      else :
        mult = 1.
        if (self.cc_one_half < 0) :
          mult = -1.
        self.cc_star = mult * sqrt((2*abs(self.cc_one_half)) /
                                   (1 + self.cc_one_half))
    self.cc_work = self.cc_free = self.r_work = self.r_free = None
    if (model_arrays is not None) and (self.n_uniq > 0) :
      self.cc_work, self.cc_free, self.r_work, self.r_free = \
        model_arrays.cc_work_and_free(array_merged)

  @property
  def cc_anom (self) :
    # alias for anom_half_corr; getattr-based for unpickling older objects
    return getattr(self, "anom_half_corr", None)

  def format (self) :
    """One fixed-width text row for the by-resolution-bin table in show()."""
    return "%6.2f %6.2f %6d %6d %5.2f %6.2f %8.1f %6.1f %s %s %s %5.3f %5.3f" % (
      self.d_max,
      self.d_min,
      self.n_obs,
      self.n_uniq,
      self.mean_redundancy,
      self.completeness*100,
      self.i_mean,
      self.i_over_sigma_mean,
      format_value("%5.3f", self.r_merge),
      format_value("%5.3f", self.r_meas),
      format_value("%5.3f", self.r_pim),
      self.cc_one_half,
      self.anom_half_corr)

  def format_for_model_cc (self) :
    """One text row for the CC*-versus-model table in show_cc_star()."""
    return "%6.2f %6.2f %6d %6.2f %6.2f %5.3f %5.3f %s %s %s %s"%(
      self.d_max, self.d_min, self.n_uniq,
      self.completeness*100, self.i_over_sigma_mean,
      self.cc_one_half, self.cc_star,
      format_value("%5.3f", self.cc_work), format_value("%5.3f", self.cc_free),
      format_value("%5.3f", self.r_work), format_value("%5.3f", self.r_free))

  def format_for_gui (self) :
    """List of pre-formatted strings for one row of the GUI statistics table."""
    return [ "%.2f - %.2f" % (self.d_max, self.d_min),
             str(self.n_obs),
             str(self.n_uniq),
             "%.1f" % self.mean_redundancy,
             "%.1f %%" % (self.completeness * 100),
             "%.1f" % self.i_over_sigma_mean,
             "%.3f" % self.r_merge,
             "%.3f" % self.r_meas,
             "%.3f" % self.r_pim,
             "%.3f" % self.cc_one_half ]

  def format_for_cc_star_gui (self) :
    """List of pre-formatted strings for one row of the GUI CC* table."""
    return [ "%.2f - %.2f" % (self.d_max, self.d_min),
             str(self.n_uniq),
             "%.1f %%" % (self.completeness * 100),
             "%.1f" % self.i_over_sigma_mean,
             "%.3f" % self.cc_one_half,
             "%.3f" % self.cc_star,
             format_value("%5.3f", self.cc_work),
             format_value("%5.3f", self.cc_free),
             format_value("%5.3f", self.r_work),
             format_value("%5.3f", self.r_free) ]

  def table_data (self) :
    """Raw values for one loggraph/table row; x-axis column is 1/d_min**2.
    The five model-based columns are appended only when available."""
    table = [(1/self.d_min**2), self.n_obs, self.n_uniq, self.mean_redundancy,
             self.completeness*100, self.i_mean, self.i_over_sigma_mean,
             self.r_merge, self.r_meas, self.r_pim, self.cc_one_half,
             self.anom_half_corr]
    if (self.cc_work is not None) :
      table.extend([self.cc_star, self.cc_work, self.cc_free, self.r_work,
                    self.r_free])
    return table

  def show_summary (self, out=sys.stdout, prefix="") :
    """Print a human-readable summary of the statistics to 'out'."""
    print >> out, prefix+"Resolution: %.2f - %.2f" % (self.d_max, self.d_min)
    print >> out, prefix+"Observations: %d" % self.n_obs
    print >> out, prefix+"Unique reflections: %d" % self.n_uniq
    print >> out, prefix+"Redundancy: %.1f" % self.mean_redundancy
    print >> out, prefix+"Completeness: %.2f%%" % (self.completeness*100)
    print >> out, prefix+"Mean intensity: %.1f" % self.i_mean
    print >> out, prefix+"Mean I/sigma(I): %.1f" % self.i_over_sigma_mean
    # negative sigmas are rejected before merging
    if (self.n_neg_sigmas > 0) :
      print >> out, prefix+"SigI < 0 (rejected): %d observations" % \
        self.n_neg_sigmas
    # excessively negative intensities can be rejected either before or after
    # merging, depending on convention used
    if (self.n_rejected_before_merge > 0) :
      print >> out, prefix+"I < -3*SigI (rejected): %d observations" % \
        self.n_rejected_before_merge
    if (self.n_rejected_after_merge > 0) :
      print >> out, prefix+"I < -3*SigI (rejected): %d reflections" % \
        self.n_rejected_after_merge
    print >> out, prefix+"R-merge: %5.3f" % self.r_merge
    print >> out, prefix+"R-meas: %5.3f" % self.r_meas
    print >> out, prefix+"R-pim: %5.3f" % self.r_pim
class dataset_statistics (object) :
  """
  Container for overall and by-shell merging statistics, plus a table_data
  object suitable for displaying graphs (or outputting loggraph format).
  """
  def __init__ (self,
      i_obs,
      crystal_symmetry=None,
      d_min=None,
      d_max=None,
      anomalous=False,
      n_bins=10,
      debug=False,
      file_name=None,
      model_arrays=None,
      sigma_filtering=Auto,
      d_min_tolerance=1.e-6,
      extend_d_max_min=False,
      log=None) :
    # :param i_obs: unmerged intensity array with sigmas
    # :param crystal_symmetry: optional override; taken from i_obs when None
    # :param d_min, d_max: resolution limits (high, low)
    # :param anomalous: keep I(+)/I(-) separate in the merging statistics
    # :param n_bins: number of resolution shells
    # :param model_arrays: optional model_based_arrays enabling CC* columns
    # :param sigma_filtering: convention, or Auto to infer from file type
    # :param d_min_tolerance: fractional slack applied to the cutoffs so
    #   reflections exactly at the limits are retained
    # :param extend_d_max_min: bin over the full requested range rather than
    #   the observed range
    self.file_name = file_name
    if (log is None) : log = null_out()
    assert (i_obs.sigmas() is not None)
    info = i_obs.info()
    sigma_filtering = get_filtering_convention(i_obs, sigma_filtering)
    if (crystal_symmetry is None) :
      assert (i_obs.space_group() is not None)
      crystal_symmetry = i_obs.crystal_symmetry()
    self.crystal_symmetry = crystal_symmetry
    i_obs = i_obs.customized_copy(
      crystal_symmetry=crystal_symmetry).set_info(info)
    if (i_obs.is_unique_set_under_symmetry()) :
      raise Sorry(("The data in %s are already merged. Only unmerged (but "+
        "scaled) data may be used in this program.")%
        i_obs.info().label_string())
    d_min_cutoff = d_min
    d_max_cutoff = d_max
    # loosen the cutoffs slightly so reflections at the limits survive the
    # resolution filter
    if (d_min is not None) :
      d_min_cutoff *= (1-d_min_tolerance)
      if (d_max is not None) :
        assert (d_max > d_min)
    if (d_max is not None) :
      d_max_cutoff *= 1+d_min_tolerance
    i_obs = i_obs.resolution_filter(
      d_min=d_min_cutoff,
      d_max=d_max_cutoff).set_info(info)
    if (i_obs.size() == 0) :
      raise Sorry("No reflections left after applying resolution cutoffs.")
    i_obs.show_summary(f=log)
    self.anom_extra = ""
    if (not anomalous) :
      i_obs = i_obs.customized_copy(anomalous_flag=False).set_info(info)
      self.anom_extra = " (non-anomalous)"
    overall_d_max_min = None
    if extend_d_max_min :
      i_obs.setup_binner(
        n_bins=n_bins,
        d_max=d_max_cutoff,
        d_min=d_min_cutoff)
      overall_d_max_min = d_max_cutoff, d_min_cutoff
    else :
      i_obs.setup_binner(n_bins=n_bins)
    # NOTE(review): this merge result is never used; merging_stats below
    # performs its own merge
    merge = i_obs.merge_equivalents()
    self.overall = merging_stats(i_obs,
      d_max_min=overall_d_max_min,
      model_arrays=model_arrays,
      anomalous=anomalous,
      debug=debug,
      sigma_filtering=sigma_filtering)
    self.bins = []
    title = "Intensity merging statistics"
    column_labels = ["1/d**2","N(obs)","N(unique)","Redundancy","Completeness",
        "Mean(I)", "Mean(I/sigma)", "R-merge", "R-meas", "R-pim", "CC1/2",
        "CC(anom)"]
    graph_names = ["Reflection counts", "Redundancy", "Completeness",
        "Mean(I)", "Mean(I/sigma)", "R-factors", "CC1/2", "CC(anom)"]
    graph_columns = [[0,1,2],[0,3],[0,4],[0,5],[0,6],[0,7,8,9],[0,10],[0,11]]
    #--- CC* mode
    if (model_arrays is not None) :
      title = "Model quality and intensity merging statistics"
      column_labels.extend(["CC*", "CC(work)", "CC(free)", "R-work", "R-free"])
      # NOTE(review): with the extended labels, CC* is column 12 and R-work/
      # R-free are 15/16; these indices (11..15) look shifted by one — verify
      graph_columns.extend([[0,11,12,13],[0,14,15]])
    #---
    self.table = data_plots.table_data(
      title=title,
      column_labels=column_labels,
      graph_names=graph_names,
      graph_columns=graph_columns,
      x_is_inverse_d_min=True,
      force_exact_x_labels=True)
    # NOTE(review): last_bin is assigned but never used here
    last_bin = None
    for bin in i_obs.binner().range_used() :
      sele_unmerged = i_obs.binner().selection(bin)
      bin_stats = merging_stats(i_obs.select(sele_unmerged),
        d_max_min=i_obs.binner().bin_d_range(bin),
        model_arrays=model_arrays,
        anomalous=anomalous,
        debug=debug,
        sigma_filtering=sigma_filtering)
      self.bins.append(bin_stats)
      self.table.add_row(bin_stats.table_data())

  @property
  def signal_table (self) :
    """table_data with the counts/redundancy/completeness/signal columns
    only (first seven columns of each bin row)."""
    column_labels = ["1/d**2","N(obs)","N(unique)","Redundancy","Completeness",
        "Mean(I)", "Mean(I/sigma)", ]
    graph_names = ["Reflection counts", "Redundancy", "Completeness",
        "Mean(I)", "Mean(I/sigma)",]
    graph_columns = [[0,1,2],[0,3],[0,4],[0,5],[0,6],]
    table = data_plots.table_data(
      title="Statistics for redundancy, completeness, and signal",
      column_labels=column_labels,
      graph_names=graph_names,
      graph_columns=graph_columns,
      column_formats=["%6.2f","%6d","%6d","%5.2f","%6.2f","%8.1f","%6.1f"],
      x_is_inverse_d_min=True,
      force_exact_x_labels=True)
    for bin in self.bins :
      data = bin.table_data()
      table.add_row(data[0:7])
    return table

  @property
  def quality_table (self) :
    """table_data with the consistency columns (R-factors, CC1/2, CC(anom))."""
    column_labels = ["1/d**2", "R-merge", "R-meas", "R-pim", "CC1/2",
                     "CC(anom)"]
    graph_columns = [[0,1,2,3],[0,4],[0,5]]
    graph_names = ["R-factors", "CC1/2", "CC(anom)"]
    table = data_plots.table_data(
      title="Statistics for dataset consistency",
      column_labels=column_labels,
      column_formats=["%6.2f","%5.3f", "%5.3f", "%5.3f", "%5.3f", "%5.3f"],
      graph_names=graph_names,
      graph_columns=graph_columns,
      x_is_inverse_d_min=True,
      force_exact_x_labels=True)
    for bin in self.bins :
      data = bin.table_data()
      table.add_row([ data[0] ] + data[7:12])
    return table

  @property
  def cc_anom_table (self) :
    """table_data with only the half-dataset anomalous correlation."""
    # NOTE(review): these three locals are unused; the literals are repeated
    # in the table_data call below
    column_labels = ["1/d**2", "CC(anom)"]
    graph_columns = [[0,1]]
    graph_names = ["CC(anom)"]
    table = data_plots.table_data(
      title="Half-dataset anomalous correlation",
      column_labels=["1/d**2", "CC(anom)"],
      column_formats=["%6.2f", "%5.3f"],
      graph_names=["CC(anom)"],
      graph_columns=[[0,1]],
      x_is_inverse_d_min=True,
      force_exact_x_labels=True)
    for bin in self.bins :
      data = bin.table_data()
      table.add_row([ (1/bin.d_min**2), bin.anom_half_corr ])
    return table

  def show_loggraph (self, out=None) :
    """Print the main table in CCP4 loggraph format."""
    if (out is None) : out = sys.stdout
    print >> out, ""
    print >> out, self.table.format_loggraph()
    print >> out, ""

  def show (self, out=None, header=True) :
    """Print the overall summary, the redundancy histogram, and the
    by-resolution-bin table."""
    if (out is None) : out = sys.stdout
    if (header) :
      make_sub_header("Merging statistics", out=out)
    self.overall.show_summary(out)
    print >> out, ""
    print >> out, "Redundancies%s:" % self.anom_extra
    n_obs = sorted(self.overall.redundancies.keys())
    for x in n_obs :
      print >> out, " %d : %d" % (x, self.overall.redundancies[x])
    print >> out, ""
    print >> out, """\
Statistics by resolution bin:
d_max d_min #obs #uniq mult. %comp <I> <I/sI> r_mrg r_meas r_pim cc1/2 cc_ano"""
    for bin_stats in self.bins :
      print >> out, bin_stats.format()
    print >> out, self.overall.format()

  def show_cc_star (self, out=None) :
    """Print the CC*/model-comparison table (requires model_arrays)."""
    make_sub_header("CC* and related statistics", out=out)
    print >> out, """\
d_max d_min n_uniq compl. <I/sI> cc_1/2 cc* cc_work cc_free r_work r_free"""
    # NOTE(review): enumerate index k is unused
    for k, bin in enumerate(self.bins) :
      print >> out, bin.format_for_model_cc()
    print >> out, self.overall.format_for_model_cc()

  def extract_outer_shell_stats (self) :
    """
    For compatibility with iotbx.logfiles (which should probably now be
    deprecated) and phenix.table_one
    """
    shell = self.bins[-1]
    return group_args(
      d_max_min=(shell.d_max, shell.d_min),
      n_refl=shell.n_uniq,
      n_refl_all=shell.n_obs,
      completeness=shell.completeness,
      multiplicity=shell.mean_redundancy, # XXX bad
      r_sym=shell.r_merge,
      r_meas=shell.r_meas,
      cc_one_half=shell.cc_one_half,
      cc_star=shell.cc_star,
      i_over_sigma=shell.i_over_sigma_mean)

  def as_cif_block(self, cif_block=None):
    """Export overall statistics as _reflns items and per-shell statistics
    as a _reflns_shell loop; creates a new CIF block when none is given."""
    import iotbx.cif.model
    if cif_block is None:
      cif_block = iotbx.cif.model.block()
    observed_criterion_sigma_I = self.overall.observed_criterion_sigma_I
    if observed_criterion_sigma_I is None:
      observed_criterion_sigma_I = "?"
    cif_block["_reflns.d_resolution_low"] = self.overall.d_max
    cif_block["_reflns.d_resolution_high"] = self.overall.d_min
    cif_block["_reflns.percent_possible_obs"] = self.overall.completeness * 100
    cif_block["_reflns.pdbx_number_measured_all"] = self.overall.n_obs
    cif_block["_reflns.number_obs"] = self.overall.n_uniq
    cif_block["_reflns.pdbx_redundancy"] = self.overall.mean_redundancy
    cif_block["_reflns.phenix_mean_I"] = self.overall.i_mean
    cif_block["_reflns.pdbx_netI_over_sigmaI"] = self.overall.i_over_sigma_mean
    cif_block["_reflns.pdbx_Rmerge_I_obs"] = self.overall.r_merge
    cif_block["_reflns.pdbx_Rrim_I_obs"] = self.overall.r_meas
    cif_block["_reflns.pdbx_Rpim_I_obs"] = self.overall.r_pim
    cif_block["_reflns.phenix_cc_star"] = self.overall.cc_star
    cif_block["_reflns.phenix_cc_1/2"] = self.overall.cc_one_half
    cif_block["_reflns.observed_criterion_sigma_I"] = observed_criterion_sigma_I
    cif_block["_reflns.observed_criterion_sigma_F"] = "?"
    reflns_shell_loop = iotbx.cif.model.loop(header=(
      "_reflns_shell.d_res_high",
      "_reflns_shell.d_res_low",
      "_reflns_shell.number_measured_obs",
      "_reflns_shell.number_unique_obs",
      "_reflns_shell.pdbx_redundancy",
      "_reflns_shell.percent_possible_obs",
      "_reflns_shell.phenix_mean_I",
      "_reflns_shell.pdbx_netI_over_sigmaI_obs",
      "_reflns_shell.meanI_over_sigI_obs",
      "_reflns_shell.Rmerge_I_obs",
      "_reflns_shell.pdbx_Rrim_I_obs",
      "_reflns_shell.pdbx_Rpim_I_obs",
      "_reflns_shell.phenix_cc_star",
      "_reflns_shell.phenix_cc_1/2",
      ))
    for bin_stats in self.bins:
      # row order matches the loop header: d_min is d_res_high, d_max is
      # d_res_low
      reflns_shell_loop.add_row((
        bin_stats.d_min,
        bin_stats.d_max,
        bin_stats.n_obs,
        bin_stats.n_uniq,
        bin_stats.mean_redundancy,
        bin_stats.completeness*100,
        bin_stats.i_mean,
        bin_stats.i_over_sigma_mean,
        bin_stats.i_mean_over_sigi_mean,
        bin_stats.r_merge,
        bin_stats.r_meas,
        bin_stats.r_pim,
        bin_stats.cc_star,
        bin_stats.cc_one_half))
    cif_block.add_loop(reflns_shell_loop)
    return cif_block

  def as_remark_200 (self, wavelength=None) :
    """Format a PDB REMARK 200 block; fields not derivable from the merging
    statistics are written as NULL."""
    from libtbx.test_utils import approx_equal
    synchrotron = wl = "NULL"
    if (wavelength is not None) :
      out = cStringIO.StringIO()
      # XXX somewhat risky... guess synchrotron vs. home source by comparing
      # against Cu-Kalpha (1.5418 A) and Mo-Kalpha (0.7107 A)
      if (not approx_equal(wavelength, 1.5418, eps=0.01, out=out) and
          not approx_equal(wavelength, 0.7107, eps=0.01, out=out)) :
        synchrotron = "Y"
      else :
        synchrotron = "N"
      wl = "%.4f" % wavelength
    lines = []
    lines.append("")
    lines.append("EXPERIMENTAL DETAILS")
    lines.append(" EXPERIMENT TYPE : X-RAY DIFFRACTION")
    lines.append(" DATE OF DATA COLLECTION : NULL")
    lines.append(" TEMPERATURE (KELVIN) : NULL")
    lines.append(" PH : NULL")
    lines.append(" NUMBER OF CRYSTALS USED : NULL")
    lines.append("")
    # NOTE(review): 'synchrotron' is computed above but NULL is printed here
    lines.append(" SYNCHROTRON (Y/N) : NULL")
    lines.append(" RADIATION SOURCE : NULL")
    lines.append(" BEAMLINE : NULL")
    lines.append(" X-RAY GENERATOR MODEL : NULL")
    lines.append(" MONOCHROMATIC OR LAUE (M/L) : M")
    lines.append(" WAVELENGTH OR RANGE (A) : %s" % wl)
    lines.append(" MONOCHROMATOR : NULL")
    lines.append(" OPTICS : NULL")
    lines.append("")
    lines.append(" DETECTOR TYPE : NULL")
    lines.append(" DETECTOR MANUFACTURER : NULL")
    lines.append(" INTENSITY-INTEGRATION SOFTWARE : NULL")
    lines.append(" DATA SCALING SOFTWARE : NULL")
    lines.append("")
    lines.append("OVERALL.")
    comp_overall = format_value("%.1f", self.overall.completeness * 100)
    mult_overall = format_value("%.1f", self.overall.mean_redundancy)
    rmerg_overall = format_value("%.5f", self.overall.r_merge)
    s2n_overall = format_value("%.4f", self.overall.i_over_sigma_mean)
    lines.append(" COMPLETENESS FOR RANGE (%%) : %s" % comp_overall)
    lines.append(" DATA REDUNDANCY : %s" % mult_overall)
    lines.append(" R MERGE (I) : %s" % rmerg_overall)
    lines.append(" R SYM (I) : NULL")
    lines.append(" <I/SIGMA(I)> FOR THE DATA SET : %s" % s2n_overall)
    lines.append("")
    lines.append("IN THE HIGHEST RESOLUTION SHELL.")
    bin_stats = self.bins[-1]
    d_max = format_value("%.2f", bin_stats.d_max)
    d_min = format_value("%.2f", bin_stats.d_min)
    comp_lastbin = format_value("%.1f", bin_stats.completeness * 100)
    mult_lastbin = format_value("%.1f", bin_stats.mean_redundancy)
    rmerg_lastbin = format_value("%.5f", bin_stats.r_merge)
    s2n_lastbin = format_value("%.4f", bin_stats.i_over_sigma_mean)
    lines.append(" HIGHEST RESOLUTION SHELL, RANGE HIGH (A) : %s" % d_min)
    lines.append(" HIGHEST RESOLUTION SHELL, RANGE LOW (A) : %s" % d_max)
    lines.append(" COMPLETENESS FOR SHELL (%%) : %s" % comp_lastbin)
    lines.append(" DATA REDUNDANCY IN SHELL : %s" % mult_lastbin)
    lines.append(" R MERGE FOR SHELL (I) : %s" % rmerg_lastbin)
    lines.append(" R SYM FOR SHELL (I) : NULL")
    lines.append(" <I/SIGMA(I)> FOR SHELL : %s" % s2n_lastbin)
    lines.append("")
    remark_lines = [ "REMARK 200 %s" % line for line in lines ]
    return "\n".join(remark_lines)

  def show_model_vs_data (self, out=None, prefix="") :
    """Print overall (and outer-shell, parenthesized) statistics including
    the model-based CC values; requires model_arrays to have been supplied."""
    assert (self.overall.cc_work is not None)
    if (out is None) : out = sys.stdout
    outer_shell = self.bins[-1]
    print >> out, prefix + "Merging statistics and CC*:"
    print >> out, prefix + " Resolution : %.3f - %.3f (%.3f - %.3f)" % (
      self.overall.d_max, self.overall.d_min, outer_shell.d_max,
      outer_shell.d_min)
    print >> out, prefix + " Mean(I/sigmaI) : %6.3f (%.3f)" % (
      self.overall.i_over_sigma_mean, outer_shell.i_over_sigma_mean)
    print >> out, prefix + " Redundancy : %4.2f (%.2f)" % (
      self.overall.mean_redundancy, outer_shell.mean_redundancy)
    print >> out, prefix + " R-merge : %5.3f (%.3f)" % (
      self.overall.r_merge, outer_shell.r_merge)
    print >> out, prefix + " R-meas : %5.3f (%.3f)" % (
      self.overall.r_meas, outer_shell.r_meas)
    print >> out, prefix + " R-pim : %5.3f (%.3f)" % (
      self.overall.r_pim, outer_shell.r_pim)
    print >> out, prefix + " CC1/2 : %5.3f (%.3f)" % (
      self.overall.cc_one_half, outer_shell.cc_one_half)
    print >> out, prefix + " CC* : %5.3f (%.3f)" % (
      self.overall.cc_star, outer_shell.cc_star)
    print >> out, prefix + " CC(work) : %6.4f (%.4f)" % (
      self.overall.cc_work, outer_shell.cc_work)
    if (self.overall.cc_free is not None) :
      print >> out, prefix + " CC(free) : %6.4f (%.4f)" % (
        self.overall.cc_free, outer_shell.cc_free)
    else :
      print >> out, prefix + " CC(free) : not available"

  def estimate_d_min (self,
      min_i_over_sigma=0,
      min_cc_one_half=0,
      max_r_merge=sys.maxint,
      max_r_meas=sys.maxint,
      min_cc_anom=-1,
      min_completeness=0) :
    """
    Determine approximate resolution cutoffs based on a variety of metrics.
    Numbers are assumed to be fractional, not percentage values, except for
    the completeness which will be treated as a percent if the cutoff is
    greater than 1.
    :param min_i_over_sigma: minimum Mean(I/sigmaI) for outer shell
    :param min_cc_one_half: minimum CC1/2 for outer shell
    :param max_r_merge: maximum R-merge for outer shell
    :param max_r_meas: maximum R-meas for outer shell
    :param min_cc_anom: minimum CC(anom) for outer shell
    :param min_completeness: minimum completeness for outer shell
    :returns: Python float representing d_min for the outermost acceptable
    resolution bin, or None if no bins meet the given criteria
    """
    # NOTE(review): with the numeric defaults above this count can never
    # equal 6 unless callers explicitly pass None for every argument
    if ([min_i_over_sigma,min_cc_one_half,max_r_merge,max_r_meas,min_cc_anom,
         min_completeness].count(None) == 6) :
      return None
    if (min_completeness > 1) :
      min_completeness /= 100.
    # NOTE(review): d_min local is unused
    d_min = None
    last_bin = None
    # walk from low to high resolution; stop at the first bin that fails any
    # active criterion and report the last bin that passed all of them
    for bin in self.bins :
      if ((bin.i_over_sigma_mean < min_i_over_sigma) or
          (bin.cc_one_half < min_cc_one_half) or
          ((max_r_merge is not None) and (bin.r_merge > max_r_merge)) or
          ((max_r_meas is not None) and (bin.r_meas > max_r_meas)) or
          (bin.cc_anom < min_cc_anom) or
          (bin.completeness < min_completeness)) :
        break
      last_bin = bin
    if (last_bin is None) :
      return None
    else :
      return last_bin.d_min

  def show_estimated_cutoffs (self, out=sys.stdout, prefix="") :
    """Print suggested resolution cutoffs for several common criteria."""
    print >> out, ""
    print >> out, ""
    def format_d_min (value) :
      if (value is None) :
        return "(use all data)" #% self.d_min_overall
      return "%7.3f" % value
    make_sub_header("Resolution cutoff estimates", out=out)
    print >> out, prefix + " resolution of all data : %7.3f" % \
      self.overall.d_min
    cc_one_half_cut = self.estimate_d_min(min_cc_one_half=0.33)
    i_over_sigma_cut = self.estimate_d_min(min_i_over_sigma=2.0)
    r_merge_cut = self.estimate_d_min(max_r_merge=0.5)
    r_meas_cut = self.estimate_d_min(max_r_meas=0.5)
    # NOTE(review): cc_anom_cut is computed but never reported below
    cc_anom_cut = self.estimate_d_min(min_cc_anom=0.3)
    completeness_cut_conservative = self.estimate_d_min(min_completeness=0.9)
    completeness_cut_permissive = self.estimate_d_min(min_completeness=0.5)
    print >> out, prefix + " based on CC(1/2) >= 0.33 : %s" % \
      format_d_min(cc_one_half_cut)
    print >> out, prefix + " based on mean(I/sigma) >= 2.0 : %s" % \
      format_d_min(i_over_sigma_cut)
    print >> out, prefix + " based on R-merge < 0.5 : %s" % \
      format_d_min(r_merge_cut)
    print >> out, prefix + " based on R-meas < 0.5 : %s" % \
      format_d_min(r_meas_cut)
    print >> out, prefix + " based on completeness >= 90%% : %s" % \
      format_d_min(completeness_cut_conservative)
    print >> out, prefix + " based on completeness >= 50%% : %s" % \
      format_d_min(completeness_cut_permissive)
    print >> out, ""
    print >> out, "NOTE: we recommend using all data out to the CC(1/2) limit"
    print >> out, "for refinement."
def select_data (file_name, data_labels, log=None,
    assume_shelx_observation_type_is=None, allow_amplitudes=None) :
  """
  Read a reflection file and return a single unmerged X-ray intensity array.
  If data_labels matches an array's label string, that array is used;
  otherwise a single unambiguous intensity array is required.  For SHELX
  files (or when allow_amplitudes is set), amplitude arrays are accepted as
  a fallback and converted to intensities.  Raises Sorry when no intensities
  are found, when multiple candidates are ambiguous, or when the selected
  array is not an intensity array.
  """
  if (log is None) : log = null_out()
  from iotbx import reflection_file_reader
  hkl_in = reflection_file_reader.any_reflection_file(file_name)
  print >> log, "Format:", hkl_in.file_type()
  miller_arrays = hkl_in.as_miller_arrays(merge_equivalents=False,
    assume_shelx_observation_type_is=assume_shelx_observation_type_is)
  # bare SHELX .hkl files do not say whether they hold I or F; warn unless
  # the caller disambiguated via the filename ("file.hkl=intensities") or
  # the assume_shelx_observation_type_is argument
  if ((hkl_in.file_type() == "shelx_hklf") and (not "=" in file_name)
      and assume_shelx_observation_type_is is None) :
    print >> log, "WARNING: SHELX file is assumed to contain intensities"
  i_obs = None
  all_i_obs = []
  for array in miller_arrays :
    labels = array.info().label_string()
    if (labels == data_labels) :
      # exact label match wins immediately
      i_obs = array
      break
    elif (array.is_xray_intensity_array()) :
      all_i_obs.append(array)
  # if no intensities...try again with amplitudes
  if (hkl_in.file_type() == "shelx_hklf" or allow_amplitudes) :
    if (i_obs is None and len(all_i_obs)==0) :
      for array in miller_arrays :
        if (array.is_xray_amplitude_array()) :
          # convert F to F^2 so downstream code sees intensities
          all_i_obs.append(array.f_as_f_sq())
  if (i_obs is None) :
    if (len(all_i_obs) == 0) :
      raise Sorry("No intensities found in %s." % file_name)
    elif (len(all_i_obs) > 1) :
      raise Sorry("Multiple intensity arrays - please specify one:\n%s" %
        "\n".join([" labels=%s"%a.info().label_string() for a in all_i_obs]))
    else :
      i_obs = all_i_obs[0]
  # a label-matched array could still be the wrong type; reject it here
  if (not i_obs.is_xray_intensity_array()) :
    raise Sorry("%s is not an intensity array." % i_obs.info().label_string())
  return i_obs